[ot-tags] Speed up hb_ot_tags_from_language()

Part of https://github.com/harfbuzz/harfbuzz/issues/3591

"After that, bulk of the time I suppose is spent in binary-searching the
language table. I suggest we split the language table into 2-letter and
3-letter tags, to speed up the vast majority of cases that are
2-letter."

benchmark-ot, before:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN        112 ns          111 ns      6286271
BM_hb_ot_tags_from_script_and_language/COMMON en_US       60.6 ns         60.4 ns     11671176
BM_hb_ot_tags_from_script_and_language/LATIN en_US        61.3 ns         61.1 ns     11442645
BM_hb_ot_tags_from_script_and_language/COMMON none        4.75 ns         4.74 ns    146997235
BM_hb_ot_tags_from_script_and_language/LATIN none         4.65 ns         4.64 ns    150938747

After:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN       89.5 ns         89.2 ns      7747649
BM_hb_ot_tags_from_script_and_language/COMMON en_US       38.5 ns         38.4 ns     18199432
BM_hb_ot_tags_from_script_and_language/LATIN en_US        39.0 ns         38.9 ns     18049238
BM_hb_ot_tags_from_script_and_language/COMMON none        4.53 ns         4.52 ns    154895110
BM_hb_ot_tags_from_script_and_language/LATIN none         4.54 ns         4.52 ns    154762105
This commit is contained in:
Behdad Esfahbod 2022-05-17 14:28:28 -06:00
parent 9baccb9860
commit dd3c858f84
3 changed files with 270 additions and 239 deletions

View File

@ -894,7 +894,6 @@ print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
def hb_tag (tag):
"""Convert a tag to ``HB_TAG`` form.
@ -944,33 +943,35 @@ def get_matching_language_name (intersection, candidates):
def same_tag (bcp_47_tag, ot_tags):
return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
for language, tags in sorted (ot.from_bcp_47.items ()):
if language == '' or '-' in language:
continue
commented_out = same_tag (language, tags)
for i, tag in enumerate (tags, start=1):
print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
if commented_out:
print ('*/', end='')
print ('\t/* ', end='')
bcp_47_name = bcp_47.names.get (language, '')
bcp_47_name_candidates = bcp_47_name.split ('\n')
ot_name = ot.names[tag]
scope = bcp_47.scopes.get (language, '')
if tag == DEFAULT_LANGUAGE_SYSTEM:
write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
else:
intersection = language_name_intersection (bcp_47_name, ot_name)
if not intersection:
write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
for language_len in (2, 3):
print ('static const LangTag ot_languages%d[] = {' % language_len)
for language, tags in sorted (ot.from_bcp_47.items ()):
if language == '' or '-' in language:
continue
if len(language) != language_len: continue
commented_out = same_tag (language, tags)
for i, tag in enumerate (tags, start=1):
print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
if commented_out:
print ('*/', end='')
print ('\t/* ', end='')
bcp_47_name = bcp_47.names.get (language, '')
bcp_47_name_candidates = bcp_47_name.split ('\n')
ot_name = ot.names[tag]
scope = bcp_47.scopes.get (language, '')
if tag == DEFAULT_LANGUAGE_SYSTEM:
write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
else:
name = get_matching_language_name (intersection, bcp_47_name_candidates)
bcp_47.names[language] = name
write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
print (' */')
print ('};')
print ()
intersection = language_name_intersection (bcp_47_name, ot_name)
if not intersection:
write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
else:
name = get_matching_language_name (intersection, bcp_47_name_candidates)
bcp_47.names[language] = name
write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
print (' */')
print ('};')
print ()
print ('/**')
print (' * hb_ot_tags_from_complex_language:')

File diff suppressed because it is too large. (Load Diff)

View File

@ -283,7 +283,21 @@ hb_ot_tags_from_language (const char *lang_str,
ISALPHA (s[1]))
lang_str = s + 1;
}
if (hb_sorted_array (ot_languages).bfind (lang_str, &tag_idx))
const LangTag *ot_languages = nullptr;
unsigned ot_languages_len = 0;
const char *dash = strchr (lang_str, '-');
unsigned first_len = dash ? dash - lang_str : limit - lang_str;
if (first_len == 2)
{
ot_languages = ot_languages2;
ot_languages_len = ARRAY_LENGTH (ot_languages2);
}
else if (first_len == 3)
{
ot_languages = ot_languages3;
ot_languages_len = ARRAY_LENGTH (ot_languages3);
}
if (hb_sorted_array (ot_languages, ot_languages_len).bfind (lang_str, &tag_idx))
{
unsigned int i;
while (tag_idx != 0 &&
@ -291,7 +305,7 @@ hb_ot_tags_from_language (const char *lang_str,
tag_idx--;
for (i = 0;
i < *count &&
tag_idx + i < ARRAY_LENGTH (ot_languages) &&
tag_idx + i < ot_languages_len &&
ot_languages[tag_idx + i].tag != HB_TAG_NONE &&
0 == strcmp (ot_languages[tag_idx + i].language, ot_languages[tag_idx].language);
i++)
@ -459,9 +473,12 @@ hb_ot_tag_to_language (hb_tag_t tag)
return disambiguated_tag;
}
for (i = 0; i < ARRAY_LENGTH (ot_languages); i++)
if (ot_languages[i].tag == tag)
return hb_language_from_string (ot_languages[i].language, -1);
for (i = 0; i < ARRAY_LENGTH (ot_languages2); i++)
if (ot_languages2[i].tag == tag)
return hb_language_from_string (ot_languages2[i].language, -1);
for (i = 0; i < ARRAY_LENGTH (ot_languages3); i++)
if (ot_languages3[i].tag == tag)
return hb_language_from_string (ot_languages3[i].language, -1);
/* Return a custom language in the form of "x-hbot-AABBCCDD".
* If it's three letters long, also guess it's ISO 639-3 and lower-case and
@ -557,13 +574,23 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
static inline void
test_langs_sorted ()
{
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages); i++)
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages2); i++)
{
int c = ot_languages[i].cmp (&ot_languages[i - 1]);
int c = ot_languages2[i].cmp (&ot_languages2[i - 1]);
if (c > 0)
{
fprintf (stderr, "ot_languages not sorted at index %d: %s %d %s\n",
i, ot_languages[i-1].language, c, ot_languages[i].language);
fprintf (stderr, "ot_languages2 not sorted at index %d: %s %d %s\n",
i, ot_languages2[i-1].language, c, ot_languages2[i].language);
abort();
}
}
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages3); i++)
{
int c = ot_languages3[i].cmp (&ot_languages3[i - 1]);
if (c > 0)
{
fprintf (stderr, "ot_languages3 not sorted at index %d: %s %d %s\n",
i, ot_languages3[i-1].language, c, ot_languages3[i].language);
abort();
}
}