[ot-tags] Speed up hb_ot_tags_from_language()

Part of https://github.com/harfbuzz/harfbuzz/issues/3591

"After that, bulk of the time I suppose is spent in binary-searching the
language table. I suggest we split the language table in 2-letter and
3-letter tags, to speed-up the vast majority of cases that are
2-letter."

benchmark-ot results before this change:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN        112 ns          111 ns      6286271
BM_hb_ot_tags_from_script_and_language/COMMON en_US       60.6 ns         60.4 ns     11671176
BM_hb_ot_tags_from_script_and_language/LATIN en_US        61.3 ns         61.1 ns     11442645
BM_hb_ot_tags_from_script_and_language/COMMON none        4.75 ns         4.74 ns    146997235
BM_hb_ot_tags_from_script_and_language/LATIN none         4.65 ns         4.64 ns    150938747

benchmark-ot results after this change:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN       89.5 ns         89.2 ns      7747649
BM_hb_ot_tags_from_script_and_language/COMMON en_US       38.5 ns         38.4 ns     18199432
BM_hb_ot_tags_from_script_and_language/LATIN en_US        39.0 ns         38.9 ns     18049238
BM_hb_ot_tags_from_script_and_language/COMMON none        4.53 ns         4.52 ns    154895110
BM_hb_ot_tags_from_script_and_language/LATIN none         4.54 ns         4.52 ns    154762105
This commit is contained in:
Behdad Esfahbod 2022-05-17 14:28:28 -06:00
parent 9baccb9860
commit dd3c858f84
3 changed files with 270 additions and 239 deletions

View File

@ -894,7 +894,6 @@ print ()
print ('#ifndef HB_OT_TAG_TABLE_HH') print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH') print ('#define HB_OT_TAG_TABLE_HH')
print () print ()
print ('static const LangTag ot_languages[] = {')
def hb_tag (tag): def hb_tag (tag):
"""Convert a tag to ``HB_TAG`` form. """Convert a tag to ``HB_TAG`` form.
@ -944,33 +943,35 @@ def get_matching_language_name (intersection, candidates):
def same_tag (bcp_47_tag, ot_tags): def same_tag (bcp_47_tag, ot_tags):
return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
for language, tags in sorted (ot.from_bcp_47.items ()): for language_len in (2, 3):
if language == '' or '-' in language: print ('static const LangTag ot_languages%d[] = {' % language_len)
continue for language, tags in sorted (ot.from_bcp_47.items ()):
commented_out = same_tag (language, tags) if language == '' or '-' in language:
for i, tag in enumerate (tags, start=1): continue
print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='') if len(language) != language_len: continue
if commented_out: commented_out = same_tag (language, tags)
print ('*/', end='') for i, tag in enumerate (tags, start=1):
print ('\t/* ', end='') print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
bcp_47_name = bcp_47.names.get (language, '') if commented_out:
bcp_47_name_candidates = bcp_47_name.split ('\n') print ('*/', end='')
ot_name = ot.names[tag] print ('\t/* ', end='')
scope = bcp_47.scopes.get (language, '') bcp_47_name = bcp_47.names.get (language, '')
if tag == DEFAULT_LANGUAGE_SYSTEM: bcp_47_name_candidates = bcp_47_name.split ('\n')
write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') ot_name = ot.names[tag]
else: scope = bcp_47.scopes.get (language, '')
intersection = language_name_intersection (bcp_47_name, ot_name) if tag == DEFAULT_LANGUAGE_SYSTEM:
if not intersection: write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
else: else:
name = get_matching_language_name (intersection, bcp_47_name_candidates) intersection = language_name_intersection (bcp_47_name, ot_name)
bcp_47.names[language] = name if not intersection:
write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
print (' */') else:
name = get_matching_language_name (intersection, bcp_47_name_candidates)
print ('};') bcp_47.names[language] = name
print () write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
print (' */')
print ('};')
print ()
print ('/**') print ('/**')
print (' * hb_ot_tags_from_complex_language:') print (' * hb_ot_tags_from_complex_language:')

File diff suppressed because it is too large Load Diff

View File

@ -283,7 +283,21 @@ hb_ot_tags_from_language (const char *lang_str,
ISALPHA (s[1])) ISALPHA (s[1]))
lang_str = s + 1; lang_str = s + 1;
} }
if (hb_sorted_array (ot_languages).bfind (lang_str, &tag_idx)) const LangTag *ot_languages = nullptr;
unsigned ot_languages_len = 0;
const char *dash = strchr (lang_str, '-');
unsigned first_len = dash ? dash - lang_str : limit - lang_str;
if (first_len == 2)
{
ot_languages = ot_languages2;
ot_languages_len = ARRAY_LENGTH (ot_languages2);
}
else if (first_len == 3)
{
ot_languages = ot_languages3;
ot_languages_len = ARRAY_LENGTH (ot_languages3);
}
if (hb_sorted_array (ot_languages, ot_languages_len).bfind (lang_str, &tag_idx))
{ {
unsigned int i; unsigned int i;
while (tag_idx != 0 && while (tag_idx != 0 &&
@ -291,7 +305,7 @@ hb_ot_tags_from_language (const char *lang_str,
tag_idx--; tag_idx--;
for (i = 0; for (i = 0;
i < *count && i < *count &&
tag_idx + i < ARRAY_LENGTH (ot_languages) && tag_idx + i < ot_languages_len &&
ot_languages[tag_idx + i].tag != HB_TAG_NONE && ot_languages[tag_idx + i].tag != HB_TAG_NONE &&
0 == strcmp (ot_languages[tag_idx + i].language, ot_languages[tag_idx].language); 0 == strcmp (ot_languages[tag_idx + i].language, ot_languages[tag_idx].language);
i++) i++)
@ -459,9 +473,12 @@ hb_ot_tag_to_language (hb_tag_t tag)
return disambiguated_tag; return disambiguated_tag;
} }
for (i = 0; i < ARRAY_LENGTH (ot_languages); i++) for (i = 0; i < ARRAY_LENGTH (ot_languages2); i++)
if (ot_languages[i].tag == tag) if (ot_languages2[i].tag == tag)
return hb_language_from_string (ot_languages[i].language, -1); return hb_language_from_string (ot_languages2[i].language, -1);
for (i = 0; i < ARRAY_LENGTH (ot_languages3); i++)
if (ot_languages3[i].tag == tag)
return hb_language_from_string (ot_languages3[i].language, -1);
/* Return a custom language in the form of "x-hbot-AABBCCDD". /* Return a custom language in the form of "x-hbot-AABBCCDD".
* If it's three letters long, also guess it's ISO 639-3 and lower-case and * If it's three letters long, also guess it's ISO 639-3 and lower-case and
@ -557,13 +574,23 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
static inline void static inline void
test_langs_sorted () test_langs_sorted ()
{ {
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages); i++) for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages2); i++)
{ {
int c = ot_languages[i].cmp (&ot_languages[i - 1]); int c = ot_languages2[i].cmp (&ot_languages2[i - 1]);
if (c > 0) if (c > 0)
{ {
fprintf (stderr, "ot_languages not sorted at index %d: %s %d %s\n", fprintf (stderr, "ot_languages2 not sorted at index %d: %s %d %s\n",
i, ot_languages[i-1].language, c, ot_languages[i].language); i, ot_languages2[i-1].language, c, ot_languages2[i].language);
abort();
}
}
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages3); i++)
{
int c = ot_languages3[i].cmp (&ot_languages3[i - 1]);
if (c > 0)
{
fprintf (stderr, "ot_languages3 not sorted at index %d: %s %d %s\n",
i, ot_languages3[i-1].language, c, ot_languages3[i].language);
abort(); abort();
} }
} }