Merge pull request #3171 from googlefonts/unicode_glyph_invert

[subset] use inverted set for all unicodes.
2021-08-25 16:21:07 -06:00 · 2021-08-25 16:21:07 -06:00 · 18b4aab652
parent e9e6d66cd6 75efade7a4
commit 18b4aab652
2 changed files with 120 additions and 50 deletions
--- a/src/hb-subset-plan.cc
+++ b/src/hb-subset-plan.cc
@ -240,7 +240,12 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
 {
  OT::cmap::accelerator_t cmap;
  cmap.init (plan->source);
+  constexpr static const int size_threshold = 4000;

+  if (unicodes->get_population () < size_threshold && glyphs->is_empty ())
+  {
+    // This is the fast path, used when the size of unicodes is expected
+    // to be much smaller than the number of codepoints in the font.
    for (hb_codepoint_t cp : *unicodes)
    {
      hb_codepoint_t gid;
@ -249,33 +254,41 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
        DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp);
        continue;
      }
-    plan->unicodes->add (cp);
+
      plan->codepoint_to_glyph->set (cp, gid);
-    plan->_glyphset_gsub->add (gid);
    }
-
-  if (glyphs->is_empty ())
-  {
    cmap.fini ();
-    return;
-  }
-
+  } else {
    hb_map_t unicode_glyphid_map;
    cmap.collect_mapping (hb_set_get_empty (), &unicode_glyphid_map);
    cmap.fini ();

    for (hb_pair_t<hb_codepoint_t, hb_codepoint_t> cp_gid :
-       + unicode_glyphid_map.iter () | hb_filter (glyphs, hb_second))
+             + unicode_glyphid_map.iter ())
    {
-    plan->unicodes->add (cp_gid.first);
+      if (!unicodes->has (cp_gid.first) && !glyphs->has (cp_gid.second))
+        continue;
+
      plan->codepoint_to_glyph->set (cp_gid.first, cp_gid.second);
    }
+
+    // Add gids which were requested, but not mapped in cmap
+    // TODO(garretrieger): once https://github.com/harfbuzz/harfbuzz/issues/3169
+    //                     is implemented, this can be done with union and del_range
+    for (hb_codepoint_t gid : glyphs->iter ())
+    {
+      if (gid >= plan->source->get_num_glyphs ())
+        break;
+      plan->_glyphset_gsub->add (gid);
+    }
+  }
+
+  + plan->codepoint_to_glyph->keys () | hb_sink (plan->unicodes);
+  + plan->codepoint_to_glyph->values () | hb_sink (plan->_glyphset_gsub);
 }

 static void
 _populate_gids_to_retain (hb_subset_plan_t* plan,
-			  const hb_set_t *unicodes,
-			  const hb_set_t *input_glyphs_to_retain,
 			  bool close_over_gsub,
 			  bool close_over_gpos,
 			  bool close_over_gdef)
@ -292,7 +305,6 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
  colr.init (plan->source);

  plan->_glyphset_gsub->add (0); // Not-def
-  hb_set_union (plan->_glyphset_gsub, input_glyphs_to_retain);

  _cmap_closure (plan->source, plan->unicodes, plan->_glyphset_gsub);

@ -477,8 +489,6 @@ hb_subset_plan_create (hb_face_t	 *face,
  _populate_unicodes_to_retain (input->unicodes, input->glyphs, plan);

  _populate_gids_to_retain (plan,
-			    input->unicodes,
-			    input->glyphs,
 			    !input->drop_tables->has (HB_OT_TAG_GSUB),
 			    !input->drop_tables->has (HB_OT_TAG_GPOS),
 			    !input->drop_tables->has (HB_OT_TAG_GDEF));
--- a/util/hb-subset.cc
+++ b/util/hb-subset.cc
@ -131,13 +131,6 @@ struct subset_main_t : option_parser_t, face_options_t, output_options_t<false>
    return true;
  }

-  void
-  add_all_unicodes ()
-  {
-    hb_set_t *codepoints = hb_subset_input_unicode_set (input);
-    hb_face_collect_unicodes (face, codepoints);
-  }
-
  void add_options ();

  protected:
@ -165,8 +158,17 @@ parse_gids (const char *name G_GNUC_UNUSED,
 	    GError    **error)
 {
  subset_main_t *subset_main = (subset_main_t *) data;
+  hb_bool_t is_remove = (name[strlen (name) - 1] == '-');
  hb_set_t *gids = hb_subset_input_glyph_set (subset_main->input);

+  if (0 == strcmp (arg, "*"))
+  {
+    hb_set_clear (gids);
+    if (!is_remove)
+      hb_set_invert (gids);
+    return true;
+  }
+
  char *s = (char *) arg;
  char *p;

@ -203,11 +205,17 @@ parse_gids (const char *name G_GNUC_UNUSED,
 		     "Invalid glyph-index range %u-%u", start_code, end_code);
 	return false;
      }
+      if (!is_remove)
        hb_set_add_range (gids, start_code, end_code);
+      else
+        hb_set_del_range (gids, start_code, end_code);
    }
    else
    {
+      if (!is_remove)
        hb_set_add (gids, start_code);
+      else
+        hb_set_del (gids, start_code);
    }

    s = p;
@ -223,8 +231,17 @@ parse_glyphs (const char *name G_GNUC_UNUSED,
 	      GError    **error G_GNUC_UNUSED)
 {
  subset_main_t *subset_main = (subset_main_t *) data;
+  hb_bool_t is_remove = (name[strlen (name) - 1] == '-');
  hb_set_t *gids = hb_subset_input_glyph_set (subset_main->input);

+  if (0 == strcmp (arg, "*"))
+  {
+    hb_set_clear (gids);
+    if (!is_remove)
+      hb_set_invert (gids);
+    return true;
+  }
+
  const char *p = arg;
  const char *p_end = arg + strlen (arg);

@ -248,7 +265,10 @@ parse_glyphs (const char *name G_GNUC_UNUSED,
 	return false;
      }

+      if (!is_remove)
        hb_set_add (gids, gid);
+      else
+        hb_set_del (gids, gid);
    }

    p = end + 1;
@ -265,20 +285,26 @@ parse_text (const char *name G_GNUC_UNUSED,
 	    GError    **error G_GNUC_UNUSED)
 {
  subset_main_t *subset_main = (subset_main_t *) data;
+  hb_bool_t is_remove = (name[strlen (name) - 1] == '-');

+  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
  if (0 == strcmp (arg, "*"))
  {
-    subset_main->add_all_unicodes ();
+    hb_set_clear (unicodes);
+    if (!is_remove)
+      hb_set_invert (unicodes);
    return true;
  }

-  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
  for (gchar *c = (gchar *) arg;
       *c;
       c = g_utf8_find_next_char(c, nullptr))
  {
    gunichar cp = g_utf8_get_char(c);
+    if (!is_remove)
      hb_set_add (unicodes, cp);
+    else
+      hb_set_del (unicodes, cp);
  }
  return true;
 }
@ -290,16 +316,18 @@ parse_unicodes (const char *name G_GNUC_UNUSED,
 		GError    **error)
 {
  subset_main_t *subset_main = (subset_main_t *) data;
+  hb_bool_t is_remove = (name[strlen (name) - 1] == '-');

+  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
  if (0 == strcmp (arg, "*"))
  {
-    subset_main->add_all_unicodes ();
+    hb_set_clear (unicodes);
+    if (!is_remove)
+      hb_set_invert (unicodes);
    return true;
  }

  // XXX TODO Ranges
-  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
-
 #define DELIMITERS "<+->{},;&#\\xXuUnNiI\n\t\v\f\r "

  char *s = (char *) arg;
@ -338,11 +366,17 @@ parse_unicodes (const char *name G_GNUC_UNUSED,
 		     "Invalid Unicode range %u-%u", start_code, end_code);
 	return false;
      }
+      if (!is_remove)
        hb_set_add_range (unicodes, start_code, end_code);
+      else
+        hb_set_del_range (unicodes, start_code, end_code);
    }
    else
    {
+      if (!is_remove)
        hb_set_add (unicodes, start_code);
+      else
+        hb_set_del (unicodes, start_code);
    }

    s = p;
@ -667,13 +701,39 @@ subset_main_t::add_options ()

  GOptionEntry glyphset_entries[] =
  {
-    {"gids",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids,			"Specify glyph IDs or ranges to include in the subset", "list of glyph indices/ranges"},
+    {"gids",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids,
+     "Specify glyph IDs or ranges to include in the subset.\n"
+     "                                                    "
+     "Use --gids-=... to subtract codepoints from the current set.", "list of glyph indices/ranges or *"},
+    {"gids-",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids,			"Specify glyph IDs or ranges to remove from the subset", "list of glyph indices/ranges or *"},
+    {"gids+",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids,			"Specify glyph IDs or ranges to include in the subset", "list of glyph indices/ranges or *"},
    {"gids-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_gids>,	"Specify file to read glyph IDs or ranges from", "filename"},
-    {"glyphs",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs,			"Specify glyph names to include in the subset", "list of glyph names"},
+    {"glyphs",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs,			"Specify glyph names to include in the subset. Use --glyphs-=... to subtract glyphs from the current set.", "list of glyph names or *"},
+    {"glyphs+",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs,			"Specify glyph names to include in the subset", "list of glyph names"},
+    {"glyphs-",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs,			"Specify glyph names to remove from the subset", "list of glyph names"},
+
+
    {"glyphs-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_glyphs>,	"Specify file to read glyph names fromt", "filename"},
-    {"text",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text,			"Specify text to include in the subset", "string"},
+
+    {"text",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text,			"Specify text to include in the subset. Use --text-=... to subtract codepoints from the current set.", "string"},
+    {"text-",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text,			"Specify text to remove from the subset", "string"},
+    {"text+",		0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text,			"Specify text to include in the subset", "string"},
+
+
    {"text-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_text, false>,"Specify file to read text from", "filename"},
-    {"unicodes",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes,		"Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"},
+    {"unicodes",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes,
+     "Specify Unicode codepoints or ranges to include in the subset. Use * to include all codepoints.\n"
+     "                                                    "
+     "--unicodes-=... can be used to subtract codepoints "
+     "from the current set.\n"
+     "                                                    "
+     "For example: --unicodes=* --unicodes-=41,42,43 would create a subset with all codepoints\n"
+     "                                                    "
+     "except for 41, 42, 43.",
+     "list of hex numbers/ranges or *"},
+    {"unicodes-",	0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to remove from the subset", "list of hex numbers/ranges or *"},
+    {"unicodes+",	0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges or *"},
+
    {"unicodes-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_unicodes>,"Specify file to read Unicode codepoints or ranges from", "filename"},
    {nullptr}
  };