[ot-tags] Speed up hb_ot_tags_from_complex_language()

Part of https://github.com/harfbuzz/harfbuzz/issues/3591

2. All the subtag_matches() calls outside the switch match long strings (>= 6
   characters or so). As such, check the length of the tag before trying any of them.

benchmark-ot, before:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN        172 ns          171 ns      4083155
BM_hb_ot_tags_from_script_and_language/COMMON en_US        120 ns          119 ns      5849947
BM_hb_ot_tags_from_script_and_language/LATIN en_US         113 ns          112 ns      5840326
BM_hb_ot_tags_from_script_and_language/COMMON none        4.66 ns         4.64 ns    151396224
BM_hb_ot_tags_from_script_and_language/LATIN none         4.66 ns         4.64 ns    149019593

After:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN        112 ns          112 ns      6357763
BM_hb_ot_tags_from_script_and_language/COMMON en_US       60.5 ns         60.3 ns     11475091
BM_hb_ot_tags_from_script_and_language/LATIN en_US        54.9 ns         54.8 ns     12575690
BM_hb_ot_tags_from_script_and_language/COMMON none        4.61 ns         4.59 ns    152388450
BM_hb_ot_tags_from_script_and_language/LATIN none         4.66 ns         4.64 ns    151497600
This commit is contained in:
Behdad Esfahbod 2022-05-17 13:34:34 -06:00
parent 26d906b88b
commit 9baccb9860
2 changed files with 95 additions and 72 deletions

View File

@ -1009,6 +1009,24 @@ for initial, group in itertools.groupby ((lt_tags for lt_tags in [
key=lambda lt_tags: lt_tags[0].get_group ()): key=lambda lt_tags: lt_tags[0].get_group ()):
complex_tags[initial] += group complex_tags[initial] += group
# Find the shortest combined script/region/variant length among the 'und'
# entries, i.e. the subtags that are matched outside the switch.  The
# emitted guard lets the generated code skip all of those matches when the
# input is too short to contain any of them.
min_subtag_len = 100
for initial, items in sorted (complex_tags.items ()):
    if initial == 'und':
        for lt, tags in items:
            if tags:
                # Absent (None) subtags contribute nothing to the length.
                subtag_len = sum (len (part)
                                  for part in (lt.script, lt.region, lt.variant)
                                  if part is not None)
                if subtag_len < min_subtag_len:
                    min_subtag_len = subtag_len
min_subtag_len += 1 # For initial '-'

# Open the guarding block around the out-of-switch matches: either the input
# is longer than the shortest subtag, or it is exactly that long and begins
# with '-'.
print (' if (limit - lang_str > %d ||' % min_subtag_len)
print (" (limit - lang_str == %d && *lang_str == '-'))" % min_subtag_len)
print (' {')
for initial, items in sorted (complex_tags.items ()): for initial, items in sorted (complex_tags.items ()):
if initial != 'und': if initial != 'und':
continue continue
@ -1018,29 +1036,30 @@ for initial, items in sorted (complex_tags.items ()):
if lt.variant in bcp_47.prefixes: if lt.variant in bcp_47.prefixes:
expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
'%s is not a valid prefix of %s' % (lt.language, lt.variant)) '%s is not a valid prefix of %s' % (lt.language, lt.variant))
print (' if (', end='') print (' if (', end='')
print_subtag_matches (lt.script, False) print_subtag_matches (lt.script, False)
print_subtag_matches (lt.region, False) print_subtag_matches (lt.region, False)
print_subtag_matches (lt.variant, False) print_subtag_matches (lt.variant, False)
print (')') print (')')
print (' {') print (' {')
write (' /* %s */' % bcp_47.get_name (lt)) write (' /* %s */' % bcp_47.get_name (lt))
print () print ()
if len (tags) == 1: if len (tags) == 1:
write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
print () print ()
print (' *count = 1;') print (' *count = 1;')
else: else:
print (' hb_tag_t possible_tags[] = {') print (' hb_tag_t possible_tags[] = {')
for tag in tags: for tag in tags:
write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag])) write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
print () print ()
print (' };') print (' };')
print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
print (' tags[i] = possible_tags[i];') print (' tags[i] = possible_tags[i];')
print (' *count = i;') print (' *count = i;')
print (' return true;') print (' return true;')
print (' }') print (' }')
print (' }')
print (' switch (lang_str[0])') print (' switch (lang_str[0])')
print (' {') print (' {')

View File

@ -1639,68 +1639,72 @@ hb_ot_tags_from_complex_language (const char *lang_str,
unsigned int *count /* IN/OUT */, unsigned int *count /* IN/OUT */,
hb_tag_t *tags /* OUT */) hb_tag_t *tags /* OUT */)
{ {
if (subtag_matches (lang_str, limit, "-fonnapa")) if (limit - lang_str > 5 ||
(limit - lang_str == 5 && *lang_str == '-'))
{ {
/* Undetermined; North American Phonetic Alphabet */ if (subtag_matches (lang_str, limit, "-fonnapa"))
tags[0] = HB_TAG('A','P','P','H'); /* Phonetic transcription—Americanist conventions */ {
*count = 1; /* Undetermined; North American Phonetic Alphabet */
return true; tags[0] = HB_TAG('A','P','P','H'); /* Phonetic transcription—Americanist conventions */
} *count = 1;
if (subtag_matches (lang_str, limit, "-polyton")) return true;
{ }
/* Modern Greek (1453-); Polytonic Greek */ if (subtag_matches (lang_str, limit, "-polyton"))
tags[0] = HB_TAG('P','G','R',' '); /* Polytonic Greek */ {
*count = 1; /* Modern Greek (1453-); Polytonic Greek */
return true; tags[0] = HB_TAG('P','G','R',' '); /* Polytonic Greek */
} *count = 1;
if (subtag_matches (lang_str, limit, "-arevmda")) return true;
{ }
/* Armenian; Western Armenian (retired code) */ if (subtag_matches (lang_str, limit, "-arevmda"))
tags[0] = HB_TAG('H','Y','E',' '); /* Armenian */ {
*count = 1; /* Armenian; Western Armenian (retired code) */
return true; tags[0] = HB_TAG('H','Y','E',' '); /* Armenian */
} *count = 1;
if (subtag_matches (lang_str, limit, "-provenc")) return true;
{ }
/* Occitan (post 1500); Provençal */ if (subtag_matches (lang_str, limit, "-provenc"))
tags[0] = HB_TAG('P','R','O',' '); /* Provençal / Old Provençal */ {
*count = 1; /* Occitan (post 1500); Provençal */
return true; tags[0] = HB_TAG('P','R','O',' '); /* Provençal / Old Provençal */
} *count = 1;
if (subtag_matches (lang_str, limit, "-fonipa")) return true;
{ }
/* Undetermined; International Phonetic Alphabet */ if (subtag_matches (lang_str, limit, "-fonipa"))
tags[0] = HB_TAG('I','P','P','H'); /* Phonetic transcription—IPA conventions */ {
*count = 1; /* Undetermined; International Phonetic Alphabet */
return true; tags[0] = HB_TAG('I','P','P','H'); /* Phonetic transcription—IPA conventions */
} *count = 1;
if (subtag_matches (lang_str, limit, "-geok")) return true;
{ }
/* Undetermined; Khutsuri (Asomtavruli and Nuskhuri) */ if (subtag_matches (lang_str, limit, "-geok"))
tags[0] = HB_TAG('K','G','E',' '); /* Khutsuri Georgian */ {
*count = 1; /* Undetermined; Khutsuri (Asomtavruli and Nuskhuri) */
return true; tags[0] = HB_TAG('K','G','E',' '); /* Khutsuri Georgian */
} *count = 1;
if (subtag_matches (lang_str, limit, "-syre")) return true;
{ }
/* Undetermined; Syriac (Estrangelo variant) */ if (subtag_matches (lang_str, limit, "-syre"))
tags[0] = HB_TAG('S','Y','R','E'); /* Syriac, Estrangela script-variant (equivalent to ISO 15924 'Syre') */ {
*count = 1; /* Undetermined; Syriac (Estrangelo variant) */
return true; tags[0] = HB_TAG('S','Y','R','E'); /* Syriac, Estrangela script-variant (equivalent to ISO 15924 'Syre') */
} *count = 1;
if (subtag_matches (lang_str, limit, "-syrj")) return true;
{ }
/* Undetermined; Syriac (Western variant) */ if (subtag_matches (lang_str, limit, "-syrj"))
tags[0] = HB_TAG('S','Y','R','J'); /* Syriac, Western script-variant (equivalent to ISO 15924 'Syrj') */ {
*count = 1; /* Undetermined; Syriac (Western variant) */
return true; tags[0] = HB_TAG('S','Y','R','J'); /* Syriac, Western script-variant (equivalent to ISO 15924 'Syrj') */
} *count = 1;
if (subtag_matches (lang_str, limit, "-syrn")) return true;
{ }
/* Undetermined; Syriac (Eastern variant) */ if (subtag_matches (lang_str, limit, "-syrn"))
tags[0] = HB_TAG('S','Y','R','N'); /* Syriac, Eastern script-variant (equivalent to ISO 15924 'Syrn') */ {
*count = 1; /* Undetermined; Syriac (Eastern variant) */
return true; tags[0] = HB_TAG('S','Y','R','N'); /* Syriac, Eastern script-variant (equivalent to ISO 15924 'Syrn') */
*count = 1;
return true;
}
} }
switch (lang_str[0]) switch (lang_str[0])
{ {