[subset] use inverted set for all unicodes.

Modify the code that handles input->unicodes to be safe with possibly inverted sets. Also add the --unicodes-= and --unicodes+= flags.
2021-08-25 13:34:05 -07:00 · 2021-08-25 13:34:05 -07:00 · 8f4f47df7c
parent e9e6d66cd6
commit 8f4f47df7c
2 changed files with 39 additions and 21 deletions
--- a/src/hb-subset-plan.cc
+++ b/src/hb-subset-plan.cc
@ -240,22 +240,24 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
 {
  OT::cmap::accelerator_t cmap;
  cmap.init (plan->source);
+  constexpr static const int size_threshold = 65000;

-  for (hb_codepoint_t cp : *unicodes)
+  if (unicodes->get_population () < size_threshold && glyphs->is_empty ())
  {
-    hb_codepoint_t gid;
-    if (!cmap.get_nominal_glyph (cp, &gid))
+    // This is the fast path, used when the size of unicodes is anticipated
+    // to be much smaller than the number of codepoints in the font.
+    for (hb_codepoint_t cp : *unicodes)
    {
-      DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp);
-      continue;
+      hb_codepoint_t gid;
+      if (!cmap.get_nominal_glyph (cp, &gid))
+      {
+        DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp);
+        continue;
+      }
+      plan->unicodes->add (cp);
+      plan->codepoint_to_glyph->set (cp, gid);
+      plan->_glyphset_gsub->add (gid);
    }
-    plan->unicodes->add (cp);
-    plan->codepoint_to_glyph->set (cp, gid);
-    plan->_glyphset_gsub->add (gid);
-  }
-
-  if (glyphs->is_empty ())
-  {
    cmap.fini ();
    return;
  }
@ -265,17 +267,27 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
  cmap.fini ();

  for (hb_pair_t<hb_codepoint_t, hb_codepoint_t> cp_gid :
-       + unicode_glyphid_map.iter () | hb_filter (glyphs, hb_second))
+       + unicode_glyphid_map.iter ())
  {
+    if (!unicodes->has (cp_gid.first) && !glyphs->has (cp_gid.second))
+      continue;
+
    plan->unicodes->add (cp_gid.first);
    plan->codepoint_to_glyph->set (cp_gid.first, cp_gid.second);
+    plan->_glyphset_gsub->add (cp_gid.second);
+  }
+
+  // Add gids which were requested, but not mapped in cmap
+  for (hb_codepoint_t gid : glyphs->iter ())
+  {
+    if (gid >= plan->source->get_num_glyphs ())
+      break;
+    plan->_glyphset_gsub->add (gid);
  }
 }

 static void
 _populate_gids_to_retain (hb_subset_plan_t* plan,
-			  const hb_set_t *unicodes,
-			  const hb_set_t *input_glyphs_to_retain,
 			  bool close_over_gsub,
 			  bool close_over_gpos,
 			  bool close_over_gdef)
@ -292,7 +304,6 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
  colr.init (plan->source);

  plan->_glyphset_gsub->add (0); // Not-def
-  hb_set_union (plan->_glyphset_gsub, input_glyphs_to_retain);

  _cmap_closure (plan->source, plan->unicodes, plan->_glyphset_gsub);

@ -477,8 +488,6 @@ hb_subset_plan_create (hb_face_t	 *face,
  _populate_unicodes_to_retain (input->unicodes, input->glyphs, plan);

  _populate_gids_to_retain (plan,
-			    input->unicodes,
-			    input->glyphs,
 			    !input->drop_tables->has (HB_OT_TAG_GSUB),
 			    !input->drop_tables->has (HB_OT_TAG_GPOS),
 			    !input->drop_tables->has (HB_OT_TAG_GDEF));
--- a/util/hb-subset.cc
+++ b/util/hb-subset.cc
@ -265,20 +265,26 @@ parse_text (const char *name G_GNUC_UNUSED,
 	    GError    **error G_GNUC_UNUSED)
 {
  subset_main_t *subset_main = (subset_main_t *) data;
+  hb_bool_t is_remove = (name[strlen (name) - 1] == '-');

+  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
  if (0 == strcmp (arg, "*"))
  {
-    subset_main->add_all_unicodes ();
+    hb_set_clear (unicodes);
+    if (!is_remove)
+      hb_set_invert (unicodes);
    return true;
  }

-  hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input);
  for (gchar *c = (gchar *) arg;
       *c;
       c = g_utf8_find_next_char(c, nullptr))
  {
    gunichar cp = g_utf8_get_char(c);
-    hb_set_add (unicodes, cp);
+    if (!is_remove)
+      hb_set_add (unicodes, cp);
+    else
+      hb_set_del (unicodes, cp);
  }
  return true;
 }
@ -674,6 +680,9 @@ subset_main_t::add_options ()
    {"text",		0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text,			"Specify text to include in the subset", "string"},
    {"text-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_text, false>,"Specify file to read text from", "filename"},
    {"unicodes",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes,		"Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"},
+    {"unicodes-",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes,		"Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"},
+    {"unicodes+",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes,		"Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"},
+
    {"unicodes-file",	0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for<parse_unicodes>,"Specify file to read Unicode codepoints or ranges from", "filename"},
    {nullptr}
  };