Merge pull request #3558 from harfbuzz/set-optimize

[perf] hb_set_t optimizations and perf suite improvements
2022-04-29 18:34:00 -06:00 · 2022-04-29 18:34:00 -06:00 · a4522df378
parent 4de5352a3d 6922a2561f
commit a4522df378
11 changed files with 250 additions and 121 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -4,7 +4,7 @@ NULL =

 ACLOCAL_AMFLAGS = -I m4

-SUBDIRS = src util test docs
+SUBDIRS = src util test perf docs

 EXTRA_DIST = \
 	autogen.sh \
@ -26,19 +26,6 @@ EXTRA_DIST = \
 	subprojects/ragel.wrap \
 	subprojects/packagefiles/ragel/meson.build \
 	subprojects/ttf-parser.wrap \
-	perf/meson.build \
-	perf/perf-draw.hh \
-	perf/perf-extents.hh \
-	perf/perf-shaping.hh \
-	perf/perf.cc \
-	perf/fonts/Amiri-Regular.ttf \
-	perf/fonts/NotoNastaliqUrdu-Regular.ttf \
-	perf/fonts/NotoSansDevanagari-Regular.ttf \
-	perf/fonts/Roboto-Regular.ttf \
-	perf/texts/en-thelittleprince.txt \
-	perf/texts/en-words.txt \
-	perf/texts/fa-monologue.txt \
-	perf/texts/fa-thelittleprince.txt \
 	mingw-configure.sh \
 	$(NULL)

--- a/configure.ac
+++ b/configure.ac
@ -437,6 +437,7 @@ test/shape/data/text-rendering-tests/Makefile
 test/subset/Makefile
 test/subset/data/Makefile
 test/subset/data/repack_tests/Makefile
+perf/Makefile
 docs/Makefile
 docs/version.xml
 ])
--- a/perf/Makefile.am
+++ b/perf/Makefile.am
@ -0,0 +1,24 @@
+# Process this file with automake to produce Makefile.in
+
+NULL =
+EXTRA_DIST =
+SUBDIRS =
+
+EXTRA_DIST += \
+	meson.build \
+	perf-draw.hh \
+	perf-extents.hh \
+	perf.cc \
+	benchmark-map.cc \
+	benchmark-set.cc \
+	benchmark-shape.cc \
+	benchmark-subset.cc \
+	fonts \
+	texts \
+	$(NULL)
+
+# Convenience targets:
+lib:
+	@$(MAKE) $(AM_MAKEFLAGS) -C $(top_builddir)/src lib
+
+-include $(top_srcdir)/git.mk
--- a/perf/benchmark-map.cc
+++ b/perf/benchmark-map.cc
@ -29,11 +29,13 @@ static void BM_MapInsert(benchmark::State& state) {
  RandomMap(map_size, original);
  assert(hb_map_get_population(original) == map_size);

+  auto needle = map_size / 2;
+  auto v = 0;
  for (auto _ : state) {
    // TODO(garretrieger): create a copy of the original map.
    //                     Needs a hb_map_copy(..) in public api.

-    hb_map_set (original, rand (), rand ());
+    hb_map_set (original, needle++, v++);
  }

  hb_map_destroy(original);
@ -49,9 +51,11 @@ static void BM_MapLookup(benchmark::State& state) {
  RandomMap(map_size, original);
  assert(hb_map_get_population(original) == map_size);

+  auto needle = map_size / 2;
+
  for (auto _ : state) {
    benchmark::DoNotOptimize(
-        hb_map_get (original, rand()));
+        hb_map_get (original, needle++));
  }

  hb_map_destroy(original);
--- a/perf/benchmark-set.cc
+++ b/perf/benchmark-set.cc
@ -74,7 +74,7 @@ BENCHMARK(BM_SetOrderedInsert_1000)
         {2, 512}});          // Density

 /* Single value lookup on sets of various sizes. */
-static void BM_SetLookup(benchmark::State& state) {
+static void BM_SetLookup(benchmark::State& state, unsigned interval) {
  unsigned set_size = state.range(0);
  unsigned max_value = state.range(0) * state.range(1);

@ -82,14 +82,19 @@ static void BM_SetLookup(benchmark::State& state) {
  RandomSet(set_size, max_value, original);
  assert(hb_set_get_population(original) == set_size);

+  auto needle = max_value / 2;
  for (auto _ : state) {
    benchmark::DoNotOptimize(
-        hb_set_has (original, rand() % max_value));
+        hb_set_has (original, (needle += interval) % max_value));
  }

  hb_set_destroy(original);
 }
-BENCHMARK(BM_SetLookup)
+BENCHMARK_CAPTURE(BM_SetLookup, ordered, 3)
+    ->Ranges(
+        {{1 << 10, 1 << 16}, // Set Size
+         {2, 512}});          // Density
+BENCHMARK_CAPTURE(BM_SetLookup, random, 12345)
    ->Ranges(
        {{1 << 10, 1 << 16}, // Set Size
         {2, 512}});          // Density
--- a/perf/benchmark-shape.cc
+++ b/perf/benchmark-shape.cc
@ -0,0 +1,88 @@
+#include "benchmark/benchmark.h"
+#include <cstring>
+
+#include "hb.h"
+
+struct test_input_t
+{
+  const char *text_path; // Text file whose lines are shaped by BM_Shape.
+  const char *font_path; // Font file BM_Shape loads to build its hb_font_t.
+} tests[] = // One benchmark is registered per entry in main().
+{
+  {"perf/texts/fa-thelittleprince.txt",
+   "perf/fonts/Amiri-Regular.ttf"},
+
+  {"perf/texts/fa-thelittleprince.txt",
+   "perf/fonts/NotoNastaliqUrdu-Regular.ttf"},
+
+  {"perf/texts/fa-monologue.txt",
+   "perf/fonts/Amiri-Regular.ttf"},
+
+  {"perf/texts/fa-monologue.txt",
+   "perf/fonts/NotoNastaliqUrdu-Regular.ttf"},
+
+  {"perf/texts/en-thelittleprince.txt",
+   "perf/fonts/Roboto-Regular.ttf"},
+
+  {"perf/texts/en-words.txt",
+   "perf/fonts/Roboto-Regular.ttf"},
+};
+
+static void BM_Shape (benchmark::State &state, const test_input_t &input) // Shapes input.text_path line by line with input.font_path.
+{
+  hb_font_t *font;
+  {
+    hb_blob_t *blob = hb_blob_create_from_file_or_fail (input.font_path);
+    assert (blob); // A font file that cannot be loaded is a setup error.
+    hb_face_t *face = hb_face_create (blob, 0);
+    hb_blob_destroy (blob); // Drop our blob handle; only the face is used from here on.
+    font = hb_font_create (face);
+    hb_face_destroy (face); // Likewise, only `font` is used below.
+  }
+
+  hb_blob_t *text_blob = hb_blob_create_from_file_or_fail (input.text_path);
+  assert (text_blob);
+  unsigned orig_text_length;
+  const char *orig_text = hb_blob_get_data (text_blob, &orig_text_length);
+
+  hb_buffer_t *buf = hb_buffer_create (); // Single buffer, cleared and reused for every line.
+  for (auto _ : state)
+  {
+    unsigned text_length = orig_text_length; // Each iteration restarts at the top of the text.
+    const char *text = orig_text;
+
+    const char *end;
+    while ((end = (const char *) memchr (text, '\n', text_length))) // Walk the text one '\n'-terminated line at a time.
+    {
+      hb_buffer_clear_contents (buf);
+      hb_buffer_add_utf8 (buf, text, text_length, 0, end - text); // Item is the current line: offset 0, length end - text ('\n' excluded).
+      hb_buffer_guess_segment_properties (buf);
+      hb_shape (font, buf, nullptr, 0);
+
+      unsigned skip = end - text + 1; // Line length plus its '\n'.
+      text_length -= skip;
+      text += skip;
+    } // Trailing text without a final '\n' is not shaped: the loop stops when memchr finds none.
+  }
+  hb_buffer_destroy (buf);
+
+  hb_blob_destroy (text_blob);
+  hb_font_destroy (font);
+}
+
+int main(int argc, char** argv) // Registers one BM_Shape benchmark per tests[] entry, then runs them.
+{
+  for (auto& test_input : tests)
+  {
+    char name[1024] = "BM_Shape";
+    strcat (name, strrchr (test_input.text_path, '/')); // Appends "/<text file name>"; relies on the path containing a '/'.
+    strcat (name, strrchr (test_input.font_path, '/')); // Appends "/<font file name>"; same assumption.
+
+    benchmark::RegisterBenchmark (name, BM_Shape, test_input)
+     ->Unit(benchmark::kMillisecond); // Report timings in milliseconds.
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+}
--- a/perf/meson.build
+++ b/perf/meson.build
@ -21,6 +21,16 @@ benchmark('perf', executable('perf', 'perf.cc',
 ), workdir: meson.current_source_dir() / '..', timeout: 100)


+benchmark('benchmark-shape', executable('benchmark-shape', 'benchmark-shape.cc',
+  dependencies: [
+    google_benchmark_dep,
+  ],
+  cpp_args: [],
+  include_directories: [incconfig, incsrc],
+  link_with: [libharfbuzz],
+  install: false,
+), workdir: meson.current_source_dir() / '..', timeout: 100)
+
 benchmark('benchmark-set', executable('benchmark-set', 'benchmark-set.cc',
  dependencies: [
    google_benchmark_dep,
--- a/perf/perf-shaping.hh
+++ b/perf/perf-shaping.hh
@ -1,65 +0,0 @@
-#include "benchmark/benchmark.h"
-
-#include "hb.h"
-
-static void shape (benchmark::State &state, const char *text_path,
-		   hb_direction_t direction, hb_script_t script,
-		   const char *font_path)
-{
-  hb_font_t *font;
-  {
-    hb_blob_t *blob = hb_blob_create_from_file_or_fail (font_path);
-    assert (blob);
-    hb_face_t *face = hb_face_create (blob, 0);
-    hb_blob_destroy (blob);
-    font = hb_font_create (face);
-    hb_face_destroy (face);
-  }
-
-  hb_blob_t *text_blob = hb_blob_create_from_file_or_fail (text_path);
-  assert (text_blob);
-  unsigned text_length;
-  const char *text = hb_blob_get_data (text_blob, &text_length);
-
-  hb_buffer_t *buf = hb_buffer_create ();
-  for (auto _ : state)
-  {
-    hb_buffer_add_utf8 (buf, text, text_length, 0, -1);
-    hb_buffer_set_direction (buf, direction);
-    hb_buffer_set_script (buf, script);
-    hb_shape (font, buf, nullptr, 0);
-    hb_buffer_clear_contents (buf);
-  }
-  hb_buffer_destroy (buf);
-
-  hb_blob_destroy (text_blob);
-  hb_font_destroy (font);
-}
-
-BENCHMARK_CAPTURE (shape, fa-thelittleprince.txt - Amiri,
-		   "perf/texts/fa-thelittleprince.txt",
-		   HB_DIRECTION_RTL, HB_SCRIPT_ARABIC,
-		   "perf/fonts/Amiri-Regular.ttf");
-BENCHMARK_CAPTURE (shape, fa-thelittleprince.txt - NotoNastaliqUrdu,
-		   "perf/texts/fa-thelittleprince.txt",
-		   HB_DIRECTION_RTL, HB_SCRIPT_ARABIC,
-		   "perf/fonts/NotoNastaliqUrdu-Regular.ttf");
-
-BENCHMARK_CAPTURE (shape, fa-monologue.txt - Amiri,
-		   "perf/texts/fa-monologue.txt",
-		   HB_DIRECTION_RTL, HB_SCRIPT_ARABIC,
-		   "perf/fonts/Amiri-Regular.ttf");
-BENCHMARK_CAPTURE (shape, fa-monologue.txt - NotoNastaliqUrdu,
-		   "perf/texts/fa-monologue.txt",
-		   HB_DIRECTION_RTL, HB_SCRIPT_ARABIC,
-		   "perf/fonts/NotoNastaliqUrdu-Regular.ttf");
-
-BENCHMARK_CAPTURE (shape, en-thelittleprince.txt - Roboto,
-		   "perf/texts/en-thelittleprince.txt",
-		   HB_DIRECTION_LTR, HB_SCRIPT_LATIN,
-		   "perf/fonts/Roboto-Regular.ttf");
-
-BENCHMARK_CAPTURE (shape, en-words.txt - Roboto,
-		   "perf/texts/en-words.txt",
-		   HB_DIRECTION_LTR, HB_SCRIPT_LATIN,
-		   "perf/fonts/Roboto-Regular.ttf");
--- a/perf/perf.cc
+++ b/perf/perf.cc
@ -4,7 +4,6 @@
 #include "config.h"
 #endif

-#include "perf-shaping.hh"
 #ifdef HAVE_FREETYPE
 enum backend_t { HARFBUZZ, FREETYPE, TTF_PARSER };
 #include "perf-extents.hh"
--- a/src/hb-bit-set.hh
+++ b/src/hb-bit-set.hh
@ -874,7 +874,19 @@ struct hb_bit_set_t

  page_t *page_for (hb_codepoint_t g, bool insert = false)
  {
-    page_map_t map = {get_major (g), pages.length};
+    unsigned major = get_major (g);
+
+    /* The extra page_map length is necessary; can't just rely on vector here,
+     * since the next check would be tricked because a null page also has
+     * major==0, which we can't distinguish from an actual major==0 page... */
+    if (likely (last_page_lookup < page_map.length))
+    {
+      auto &cached_page = page_map.arrayZ[last_page_lookup];
+      if (cached_page.major == major)
+	return &pages[cached_page.index];
+    }
+
+    page_map_t map = {major, pages.length};
    unsigned int i;
    if (!page_map.bfind (map, &i, HB_NOT_FOUND_STORE_CLOSEST))
    {
@ -890,15 +902,31 @@ struct hb_bit_set_t
 	       (page_map.length - 1 - i) * page_map.item_size);
      page_map[i] = map;
    }
+
+    last_page_lookup = i;
    return &pages[page_map[i].index];
  }
  const page_t *page_for (hb_codepoint_t g) const
  {
-    page_map_t key = {get_major (g)};
-    const page_map_t *found = page_map.bsearch (key);
-    if (found)
-      return &pages[found->index];
-    return nullptr;
+    unsigned major = get_major (g);
+
+    /* The extra page_map length is necessary; can't just rely on vector here,
+     * since the next check would be tricked because a null page also has
+     * major==0, which we can't distinguish from an actual major==0 page... */
+    if (likely (last_page_lookup < page_map.length))
+    {
+      auto &cached_page = page_map.arrayZ[last_page_lookup];
+      if (cached_page.major == major) /* Fast path: same page as the previously remembered lookup. */
+	return &pages[cached_page.index];
+    }
+
+    page_map_t key = {major};
+    unsigned int i;
+    if (!page_map.bfind (key, &i))
+      return nullptr; /* bfind reported no entry for this major. */
+
+    last_page_lookup = i; /* Remember the hit for the fast path above; written from a const method, so presumably declared mutable -- confirm at the member declaration. */
+    return &pages[page_map[i].index];
  }
  page_t &page_at (unsigned int i) { return pages[page_map[i].index]; }
  const page_t &page_at (unsigned int i) const { return pages[page_map[i].index]; }
--- a/src/hb-ot-cmap-table.hh
+++ b/src/hb-ot-cmap-table.hh
@ -109,22 +109,26 @@ struct CmapSubtableFormat4

    while (it) {
      // Start a new range
-      start_cp = (*it).first;
-      prev_run_start_cp = (*it).first;
-      run_start_cp = (*it).first;
-      end_cp = (*it).first;
-      last_gid = (*it).second;
-      run_length = 1;
-      prev_delta = 0;
+      {
+        const auto& pair = *it;
+        start_cp = pair.first;
+        prev_run_start_cp = start_cp;
+        run_start_cp = start_cp;
+        end_cp = start_cp;
+        last_gid = pair.second;
+        run_length = 1;
+        prev_delta = 0;
+      }

-      delta = (*it).second - (*it).first;
+      delta = last_gid - start_cp;
      mode = FIRST_SUB_RANGE;
      it++;

      while (it) {
        // Process range
-        hb_codepoint_t next_cp = (*it).first;
-        hb_codepoint_t next_gid = (*it).second;
+        const auto& pair = *it;
+        hb_codepoint_t next_cp = pair.first;
+        hb_codepoint_t next_gid = pair.second;
        if (next_cp != end_cp + 1) {
          // Current range is over, stop processing.
          break;
@ -282,16 +286,15 @@ struct CmapSubtableFormat4
  }

  template<typename Iterator,
-	   hb_requires (hb_is_iterator (Iterator))>
+          hb_requires (hb_is_iterator (Iterator))>
  HBUINT16* serialize_rangeoffset_glyid (hb_serialize_context_t *c,
-					 Iterator it,
+                                         Iterator it,
 					 HBUINT16 *endCode,
 					 HBUINT16 *startCode,
 					 HBINT16 *idDelta,
 					 unsigned segcount)
  {
-    hb_hashmap_t<hb_codepoint_t, hb_codepoint_t> cp_to_gid;
-    + it | hb_sink (cp_to_gid);
+    hb_hashmap_t<hb_codepoint_t, hb_codepoint_t> cp_to_gid { it };

    HBUINT16 *idRangeOffset = c->allocate_size<HBUINT16> (HBUINT16::static_size * segcount);
    if (unlikely (!c->check_success (idRangeOffset))) return nullptr;
@ -323,22 +326,32 @@ struct CmapSubtableFormat4
 		 { return _.first <= 0xFFFF; })
    ;

-    if (format4_iter.len () == 0) return;
+    if (!format4_iter) return;

    unsigned table_initpos = c->length ();
    if (unlikely (!c->extend_min (this))) return;
    this->format = 4;

+    // TODO(grieger): does pre-alloc make this faster?
+    hb_vector_t<hb_pair_t<hb_codepoint_t, hb_codepoint_t>> cp_to_gid {
+      format4_iter
+    };
+
    //serialize endCode[], startCode[], idDelta[]
    HBUINT16* endCode = c->start_embed<HBUINT16> ();
-    unsigned segcount = serialize_find_segcount (format4_iter);
-    if (unlikely (!serialize_start_end_delta_arrays (c, format4_iter, segcount)))
+    unsigned segcount = serialize_find_segcount (cp_to_gid.iter());
+    if (unlikely (!serialize_start_end_delta_arrays (c, cp_to_gid.iter(), segcount)))
      return;

    HBUINT16 *startCode = endCode + segcount + 1;
    HBINT16 *idDelta = ((HBINT16*)startCode) + segcount;

-    HBUINT16 *idRangeOffset = serialize_rangeoffset_glyid (c, format4_iter, endCode, startCode, idDelta, segcount);
+    HBUINT16 *idRangeOffset = serialize_rangeoffset_glyid (c,
+                                                           cp_to_gid.iter (),
+                                                           endCode,
+                                                           startCode,
+                                                           idDelta,
+                                                           segcount);
    if (unlikely (!c->check_success (idRangeOffset))) return;

    this->length = c->length () - table_initpos;
@ -440,14 +453,14 @@ struct CmapSubtableFormat4
 	hb_codepoint_t start = this->startCount[i];
 	hb_codepoint_t end = this->endCount[i];
 	unsigned int rangeOffset = this->idRangeOffset[i];
+        out->add_range(start, end);
 	if (rangeOffset == 0)
 	{
 	  for (hb_codepoint_t codepoint = start; codepoint <= end; codepoint++)
 	  {
 	    hb_codepoint_t gid = (codepoint + this->idDelta[i]) & 0xFFFFu;
 	    if (unlikely (!gid))
-	      continue;
-	    out->add (codepoint);
+              out->del(codepoint);
 	  }
 	}
 	else
@ -456,11 +469,13 @@ struct CmapSubtableFormat4
 	  {
 	    unsigned int index = rangeOffset / 2 + (codepoint - this->startCount[i]) + i - this->segCount;
 	    if (unlikely (index >= this->glyphIdArrayLength))
+            {
+              out->del_range (codepoint, end);
 	      break;
+            }
 	    hb_codepoint_t gid = this->glyphIdArray[index];
 	    if (unlikely (!gid))
-	      continue;
-	    out->add (codepoint);
+              out->del(codepoint);
 	  }
 	}
      }
@ -469,6 +484,8 @@ struct CmapSubtableFormat4
    void collect_mapping (hb_set_t *unicodes, /* OUT */
 			  hb_map_t *mapping /* OUT */) const
    {
+      // TODO(grieger): optimize similar to collect_unicodes
+      // (ie. use add_range())
      unsigned count = this->segCount;
      if (count && this->startCount[count - 1] == 0xFFFFu)
 	count--; /* Skip sentinel segment. */
@ -1448,6 +1465,37 @@ struct EncodingRecord
  DEFINE_SIZE_STATIC (8);
 };

+struct SubtableUnicodesCache { // Caches the collect_unicodes() result per EncodingRecord.
+
+ private:
+  const void* base; // Base that record->subtable is resolved against in set_for().
+  hb_hashmap_t<intptr_t, hb_set_t*> cached_unicodes; // Keyed by EncodingRecord address; the sets are destroyed by the destructor.
+
+ public:
+  SubtableUnicodesCache(const void* cmap_base)
+      : base(cmap_base), cached_unicodes() {}
+  ~SubtableUnicodesCache()
+  {
+    for (hb_set_t* s : cached_unicodes.values()) {
+      hb_set_destroy (s);
+    }
+  }
+
+  hb_set_t* set_for(const EncodingRecord* record) // Returns the cached unicode set for record, filling it on first request.
+  {
+    if (!cached_unicodes.has ((intptr_t) record)) {
+      hb_set_t* new_set = hb_set_create ();
+      if (!cached_unicodes.set ((intptr_t) record, new_set)) {
+        hb_set_destroy (new_set); // Could not be stored in the map, so release it here.
+        return hb_set_get_empty (); // Fall back to the empty set; nothing is cached for this record.
+      }
+      (base+record->subtable).collect_unicodes (cached_unicodes.get ((intptr_t) record)); // Populated only once per record.
+    }
+    return cached_unicodes.get ((intptr_t) record);
+  }
+
+};
+
 struct cmap
 {
  static constexpr hb_tag_t tableTag = HB_OT_TAG_cmap;
@ -1467,6 +1515,7 @@ struct cmap
    unsigned format4objidx = 0, format12objidx = 0, format14objidx = 0;
    auto snap = c->snapshot ();

+    SubtableUnicodesCache unicodes_cache (base);
    for (const EncodingRecord& _ : encodingrec_iter)
    {
      if (c->in_error ())
@ -1475,12 +1524,11 @@ struct cmap
      unsigned format = (base+_.subtable).u.format;
      if (format != 4 && format != 12 && format != 14) continue;

-      hb_set_t unicodes_set;
-      (base+_.subtable).collect_unicodes (&unicodes_set);
+      hb_set_t* unicodes_set = unicodes_cache.set_for (&_);

      if (!drop_format_4 && format == 4)
      {
-        c->copy (_, + it | hb_filter (unicodes_set, hb_first), 4u, base, plan, &format4objidx);
+        c->copy (_, + it | hb_filter (*unicodes_set, hb_first), 4u, base, plan, &format4objidx);
        if (c->in_error () && c->only_overflow ())
        {
          // cmap4 overflowed, reset and retry serialization without format 4 subtables.
@ -1495,8 +1543,8 @@ struct cmap

      else if (format == 12)
      {
-        if (_can_drop (_, unicodes_set, base, + it | hb_map (hb_first), encodingrec_iter)) continue;
-        c->copy (_, + it | hb_filter (unicodes_set, hb_first), 12u, base, plan, &format12objidx);
+        if (_can_drop (_, *unicodes_set, base, unicodes_cache, + it | hb_map (hb_first), encodingrec_iter)) continue;
+        c->copy (_, + it | hb_filter (*unicodes_set, hb_first), 12u, base, plan, &format12objidx);
      }
      else if (format == 14) c->copy (_, it, 14u, base, plan, &format14objidx);
    }
@ -1514,6 +1562,7 @@ struct cmap
  bool _can_drop (const EncodingRecord& cmap12,
                  const hb_set_t& cmap12_unicodes,
                  const void* base,
+                  SubtableUnicodesCache& unicodes_cache,
                  Iterator subset_unicodes,
                  EncodingRecordIterator encoding_records)
  {
@ -1544,11 +1593,10 @@ struct cmap
          || (base+_.subtable).get_language() != target_language)
        continue;

-      hb_set_t sibling_unicodes;
-      (base+_.subtable).collect_unicodes (&sibling_unicodes);
+      hb_set_t* sibling_unicodes = unicodes_cache.set_for (&_);

      auto cmap12 = + subset_unicodes | hb_filter (cmap12_unicodes);
-      auto sibling = + subset_unicodes | hb_filter (sibling_unicodes);
+      auto sibling = + subset_unicodes | hb_filter (*sibling_unicodes);
      for (; cmap12 && sibling; cmap12++, sibling++)
      {
        unsigned a = *cmap12;