From 8f4f47df7c42294c06d6bd4f2d0e1b35c4040eb5 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Wed, 25 Aug 2021 13:34:05 -0700 Subject: [PATCH 1/4] [subset] use inverted set for all unicodes. Modify the code that handles input->unicodes to be safe with possibly inverted sets. Also adds --unicodes-= and --unicodes+= flags. --- src/hb-subset-plan.cc | 45 ++++++++++++++++++++++++++----------------- util/hb-subset.cc | 15 ++++++++++++--- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/src/hb-subset-plan.cc b/src/hb-subset-plan.cc index 9b74ac2ec..4c273a6be 100644 --- a/src/hb-subset-plan.cc +++ b/src/hb-subset-plan.cc @@ -240,22 +240,24 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes, { OT::cmap::accelerator_t cmap; cmap.init (plan->source); + constexpr static const int size_threshold = 65000; - for (hb_codepoint_t cp : *unicodes) + if (unicodes->get_population () < size_threshold && glyphs->is_empty ()) { - hb_codepoint_t gid; - if (!cmap.get_nominal_glyph (cp, &gid)) + // This is the fast path if it's anticipated that size of unicodes + // is << then the number of codepoints in the font. 
+ for (hb_codepoint_t cp : *unicodes) { - DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp); - continue; + hb_codepoint_t gid; + if (!cmap.get_nominal_glyph (cp, &gid)) + { + DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp); + continue; + } + plan->unicodes->add (cp); + plan->codepoint_to_glyph->set (cp, gid); + plan->_glyphset_gsub->add (gid); } - plan->unicodes->add (cp); - plan->codepoint_to_glyph->set (cp, gid); - plan->_glyphset_gsub->add (gid); - } - - if (glyphs->is_empty ()) - { cmap.fini (); return; } @@ -265,17 +267,27 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes, cmap.fini (); for (hb_pair_t cp_gid : - + unicode_glyphid_map.iter () | hb_filter (glyphs, hb_second)) + + unicode_glyphid_map.iter ()) { + if (!unicodes->has (cp_gid.first) && !glyphs->has (cp_gid.second)) + continue; + plan->unicodes->add (cp_gid.first); plan->codepoint_to_glyph->set (cp_gid.first, cp_gid.second); + plan->_glyphset_gsub->add (cp_gid.second); + } + + // Add gids which where requested, but not mapped in cmap + for (hb_codepoint_t gid : glyphs->iter ()) + { + if (gid >= plan->source->get_num_glyphs ()) + break; + plan->_glyphset_gsub->add (gid); } } static void _populate_gids_to_retain (hb_subset_plan_t* plan, - const hb_set_t *unicodes, - const hb_set_t *input_glyphs_to_retain, bool close_over_gsub, bool close_over_gpos, bool close_over_gdef) @@ -292,7 +304,6 @@ _populate_gids_to_retain (hb_subset_plan_t* plan, colr.init (plan->source); plan->_glyphset_gsub->add (0); // Not-def - hb_set_union (plan->_glyphset_gsub, input_glyphs_to_retain); _cmap_closure (plan->source, plan->unicodes, plan->_glyphset_gsub); @@ -477,8 +488,6 @@ hb_subset_plan_create (hb_face_t *face, _populate_unicodes_to_retain (input->unicodes, input->glyphs, plan); _populate_gids_to_retain (plan, - input->unicodes, - input->glyphs, !input->drop_tables->has (HB_OT_TAG_GSUB), !input->drop_tables->has (HB_OT_TAG_GPOS), !input->drop_tables->has (HB_OT_TAG_GDEF)); diff --git a/util/hb-subset.cc 
b/util/hb-subset.cc index 8456ae93a..fc7a156c9 100644 --- a/util/hb-subset.cc +++ b/util/hb-subset.cc @@ -265,20 +265,26 @@ parse_text (const char *name G_GNUC_UNUSED, GError **error G_GNUC_UNUSED) { subset_main_t *subset_main = (subset_main_t *) data; + hb_bool_t is_remove = (name[strlen (name) - 1] == '-'); + hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input); if (0 == strcmp (arg, "*")) { - subset_main->add_all_unicodes (); + hb_set_clear (unicodes); + if (!is_remove) + hb_set_invert (unicodes); return true; } - hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input); for (gchar *c = (gchar *) arg; *c; c = g_utf8_find_next_char(c, nullptr)) { gunichar cp = g_utf8_get_char(c); - hb_set_add (unicodes, cp); + if (!is_remove) + hb_set_add (unicodes, cp); + else + hb_set_del (unicodes, cp); } return true; } @@ -674,6 +680,9 @@ subset_main_t::add_options () {"text", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text, "Specify text to include in the subset", "string"}, {"text-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for,"Specify file to read text from", "filename"}, {"unicodes", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, + {"unicodes-", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, + {"unicodes+", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, + {"unicodes-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for,"Specify file to read Unicode codepoints or ranges from", "filename"}, {nullptr} }; From fa4bf7cf58e9193981c9f4a6da7f15f7ba4332d2 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Wed, 25 Aug 2021 14:31:11 -0700 Subject: [PATCH 2/4] [subset] use inverted sets for glyph id input. 
Adds --gids-=, --glyphs-=, --text-=, --unicodes-= options. Use inverted sets to represent all glyphs and/or all unicodes. --- util/hb-subset.cc | 84 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/util/hb-subset.cc b/util/hb-subset.cc index fc7a156c9..2b9935b8a 100644 --- a/util/hb-subset.cc +++ b/util/hb-subset.cc @@ -131,13 +131,6 @@ struct subset_main_t : option_parser_t, face_options_t, output_options_t return true; } - void - add_all_unicodes () - { - hb_set_t *codepoints = hb_subset_input_unicode_set (input); - hb_face_collect_unicodes (face, codepoints); - } - void add_options (); protected: @@ -165,8 +158,17 @@ parse_gids (const char *name G_GNUC_UNUSED, GError **error) { subset_main_t *subset_main = (subset_main_t *) data; + hb_bool_t is_remove = (name[strlen (name) - 1] == '-'); hb_set_t *gids = hb_subset_input_glyph_set (subset_main->input); + if (0 == strcmp (arg, "*")) + { + hb_set_clear (gids); + if (!is_remove) + hb_set_invert (gids); + return true; + } + char *s = (char *) arg; char *p; @@ -203,11 +205,17 @@ parse_gids (const char *name G_GNUC_UNUSED, "Invalid glyph-index range %u-%u", start_code, end_code); return false; } - hb_set_add_range (gids, start_code, end_code); + if (!is_remove) + hb_set_add_range (gids, start_code, end_code); + else + hb_set_del_range (gids, start_code, end_code); } else { - hb_set_add (gids, start_code); + if (!is_remove) + hb_set_add (gids, start_code); + else + hb_set_del (gids, start_code); } s = p; @@ -223,8 +231,17 @@ parse_glyphs (const char *name G_GNUC_UNUSED, GError **error G_GNUC_UNUSED) { subset_main_t *subset_main = (subset_main_t *) data; + hb_bool_t is_remove = (name[strlen (name) - 1] == '-'); hb_set_t *gids = hb_subset_input_glyph_set (subset_main->input); + if (0 == strcmp (arg, "*")) + { + hb_set_clear (gids); + if (!is_remove) + hb_set_invert (gids); + return true; + } + const char *p = arg; const char *p_end = arg + strlen (arg); @@ 
-248,7 +265,10 @@ parse_glyphs (const char *name G_GNUC_UNUSED, return false; } - hb_set_add (gids, gid); + if (!is_remove) + hb_set_add (gids, gid); + else + hb_set_del (gids, gid); } p = end + 1; @@ -296,16 +316,18 @@ parse_unicodes (const char *name G_GNUC_UNUSED, GError **error) { subset_main_t *subset_main = (subset_main_t *) data; + hb_bool_t is_remove = (name[strlen (name) - 1] == '-'); + hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input); if (0 == strcmp (arg, "*")) { - subset_main->add_all_unicodes (); + hb_set_clear (unicodes); + if (!is_remove) + hb_set_invert (unicodes); return true; } // XXX TODO Ranges - hb_set_t *unicodes = hb_subset_input_unicode_set (subset_main->input); - #define DELIMITERS "<+->{},;&#\\xXuUnNiI\n\t\v\f\r " char *s = (char *) arg; @@ -344,11 +366,17 @@ parse_unicodes (const char *name G_GNUC_UNUSED, "Invalid Unicode range %u-%u", start_code, end_code); return false; } - hb_set_add_range (unicodes, start_code, end_code); + if (!is_remove) + hb_set_add_range (unicodes, start_code, end_code); + else + hb_set_del_range (unicodes, start_code, end_code); } else { - hb_set_add (unicodes, start_code); + if (!is_remove) + hb_set_add (unicodes, start_code); + else + hb_set_del (unicodes, start_code); } s = p; @@ -673,15 +701,29 @@ subset_main_t::add_options () GOptionEntry glyphset_entries[] = { - {"gids", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to include in the subset", "list of glyph indices/ranges"}, + {"gids", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to include in the subset. Use --gids-=... 
to subtract codepoints from the current set.", "list of glyph indices/ranges or *"}, + {"gids-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to remove from the subset", "list of glyph indices/ranges or *"}, + {"gids+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to include in the subset", "list of glyph indices/ranges or *"}, {"gids-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for, "Specify file to read glyph IDs or ranges from", "filename"}, - {"glyphs", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs, "Specify glyph names to include in the subset", "list of glyph names"}, + {"glyphs", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs, "Specify glyph names to include in the subset. Use --glyphs-=... to subtract glyphs from the current set.", "list of glyph names or *"}, + {"glyphs+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs, "Specify glyph names to include in the subset", "list of glyph names"}, + {"glyphs-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_glyphs, "Specify glyph names to remove from the subset", "list of glyph names"}, + + {"glyphs-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for, "Specify file to read glyph names fromt", "filename"}, - {"text", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text, "Specify text to include in the subset", "string"}, + + {"text", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text, "Specify text to include in the subset. Use --text-=... 
to subtract codepoints from the current set.", "string"}, + {"text-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text, "Specify text to remove from the subset", "string"}, + {"text+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_text, "Specify text to include in the subset", "string"}, + + {"text-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for,"Specify file to read text from", "filename"}, - {"unicodes", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, - {"unicodes-", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, - {"unicodes+", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges"}, + {"unicodes", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, + "Specify Unicode codepoints or ranges to include in the subset. Use * to include all codepoints. --unicodes-=... can be used to subtract codepoints " + "from the current set. 
For example: --unicodes=* --unicodes-=41,42,43 would create a subset with all codepoints except for 41, 42, 43.", + "list of hex numbers/ranges or *"}, + {"unicodes-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to remove from the subset", "list of hex numbers/ranges or *"}, + {"unicodes+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges or *"}, {"unicodes-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for,"Specify file to read Unicode codepoints or ranges from", "filename"}, {nullptr} From cd07070e41eb020be0669780be4a8517cfb0e9a6 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Wed, 25 Aug 2021 14:42:00 -0700 Subject: [PATCH 3/4] [subset] Move plan unicodes and gids population to end of _populate_unicodes... --- src/hb-subset-plan.cc | 55 ++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/hb-subset-plan.cc b/src/hb-subset-plan.cc index 4c273a6be..04159eadd 100644 --- a/src/hb-subset-plan.cc +++ b/src/hb-subset-plan.cc @@ -240,12 +240,12 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes, { OT::cmap::accelerator_t cmap; cmap.init (plan->source); - constexpr static const int size_threshold = 65000; + constexpr static const int size_threshold = 4000; if (unicodes->get_population () < size_threshold && glyphs->is_empty ()) { // This is the fast path if it's anticipated that size of unicodes - // is << then the number of codepoints in the font. + // is << than the number of codepoints in the font. 
for (hb_codepoint_t cp : *unicodes) { hb_codepoint_t gid; @@ -254,36 +254,37 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes, { DEBUG_MSG(SUBSET, nullptr, "Drop U+%04X; no gid", cp); continue; } - plan->unicodes->add (cp); + plan->codepoint_to_glyph->set (cp, gid); - plan->_glyphset_gsub->add (gid); } cmap.fini (); - return; + } else { + hb_map_t unicode_glyphid_map; + cmap.collect_mapping (hb_set_get_empty (), &unicode_glyphid_map); + cmap.fini (); + + for (hb_pair_t cp_gid : + + unicode_glyphid_map.iter ()) + { + if (!unicodes->has (cp_gid.first) && !glyphs->has (cp_gid.second)) + continue; + + plan->codepoint_to_glyph->set (cp_gid.first, cp_gid.second); + } + + // Add gids which were requested, but not mapped in cmap + // TODO(garretrieger): once https://github.com/harfbuzz/harfbuzz/issues/3169 + // is implemented, this can be done with union and del_range + for (hb_codepoint_t gid : glyphs->iter ()) + { + if (gid >= plan->source->get_num_glyphs ()) + break; + plan->_glyphset_gsub->add (gid); + } } - hb_map_t unicode_glyphid_map; - cmap.collect_mapping (hb_set_get_empty (), &unicode_glyphid_map); - cmap.fini (); - - for (hb_pair_t cp_gid : - + unicode_glyphid_map.iter ()) - { - if (!unicodes->has (cp_gid.first) && !glyphs->has (cp_gid.second)) - continue; - - plan->unicodes->add (cp_gid.first); - plan->codepoint_to_glyph->set (cp_gid.first, cp_gid.second); - plan->_glyphset_gsub->add (cp_gid.second); - } - - // Add gids which where requested, but not mapped in cmap - for (hb_codepoint_t gid : glyphs->iter ()) - { - if (gid >= plan->source->get_num_glyphs ()) - break; - plan->_glyphset_gsub->add (gid); - } + + plan->codepoint_to_glyph->keys () | hb_sink (plan->unicodes); + + plan->codepoint_to_glyph->values () | hb_sink (plan->_glyphset_gsub); } static void From 75efade7a4290413711c00748b38c547021a6f26 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Wed, 25 Aug 2021 14:51:21 -0700 Subject: [PATCH 4/4] [subset] format --gids and --unicodes help 
messages better. --- util/hb-subset.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/util/hb-subset.cc b/util/hb-subset.cc index 2b9935b8a..0f1aebb65 100644 --- a/util/hb-subset.cc +++ b/util/hb-subset.cc @@ -701,7 +701,10 @@ subset_main_t::add_options () GOptionEntry glyphset_entries[] = { - {"gids", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to include in the subset. Use --gids-=... to subtract codepoints from the current set.", "list of glyph indices/ranges or *"}, + {"gids", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, + "Specify glyph IDs or ranges to include in the subset.\n" + " " + "Use --gids-=... to subtract codepoints from the current set.", "list of glyph indices/ranges or *"}, {"gids-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to remove from the subset", "list of glyph indices/ranges or *"}, {"gids+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_gids, "Specify glyph IDs or ranges to include in the subset", "list of glyph indices/ranges or *"}, {"gids-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for, "Specify file to read glyph IDs or ranges from", "filename"}, @@ -719,8 +722,14 @@ subset_main_t::add_options () {"text-file", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_file_for,"Specify file to read text from", "filename"}, {"unicodes", 0, 0, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, - "Specify Unicode codepoints or ranges to include in the subset. Use * to include all codepoints. --unicodes-=... can be used to subtract codepoints " - "from the current set. For example: --unicodes=* --unicodes-=41,42,43 would create a subset with all codepoints except for 41, 42, 43.", + "Specify Unicode codepoints or ranges to include in the subset. Use * to include all codepoints.\n" + " " + "--unicodes-=... 
can be used to subtract codepoints " + "from the current set.\n" + " " + "For example: --unicodes=* --unicodes-=41,42,43 would create a subset with all codepoints\n" + " " + "except for 41, 42, 43.", "list of hex numbers/ranges or *"}, {"unicodes-", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to remove from the subset", "list of hex numbers/ranges or *"}, {"unicodes+", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, (gpointer) &parse_unicodes, "Specify Unicode codepoints or ranges to include in the subset", "list of hex numbers/ranges or *"},