[subset] Make cmap4 packing more optimal.

The current CMAP4 implementation uses whatever the current codepoint ranges are and then encodes them as individual glyph ids or as a delta if possible. However, it's often possible to save bytes by splitting up existing ranges and encoding parts of them using deltas where the cost of splitting the range is less than encoding each glyph individually.
This commit is contained in:
Garret Rieger 2021-11-25 18:15:35 -08:00 committed by Behdad Esfahbod
parent 8aed5c21a3
commit d9660fd58a
35 changed files with 142 additions and 102 deletions

View File

@ -93,120 +93,168 @@ struct CmapSubtableFormat0
struct CmapSubtableFormat4 struct CmapSubtableFormat4
{ {
template<typename Iterator, template<typename Iterator,
typename Writer,
hb_requires (hb_is_iterator (Iterator))> hb_requires (hb_is_iterator (Iterator))>
HBUINT16* serialize_endcode_array (hb_serialize_context_t *c, void to_ranges (Iterator it, Writer& range_writer)
Iterator it)
{ {
HBUINT16 *endCode = c->start_embed<HBUINT16> (); hb_codepoint_t start_cp, run_start_cp, end_cp, last_gid;
hb_codepoint_t prev_endcp = 0xFFFF; int run_length, delta;
for (const auto& _ : +it) enum {
{ FIRST_RANGE,
if (prev_endcp != 0xFFFF && prev_endcp + 1u != _.first) FOLLOWING_RANGE,
{ } mode;
HBUINT16 end_code;
end_code = prev_endcp; while (it) {
c->copy<HBUINT16> (end_code); // Start a new range
start_cp = (*it).first;
run_start_cp = (*it).first;
end_cp = (*it).first;
last_gid = (*it).second;
run_length = 1;
delta = (*it).second - (*it).first;
mode = FIRST_RANGE;
it++;
while (it) {
// Process range
hb_codepoint_t next_cp = (*it).first;
hb_codepoint_t next_gid = (*it).second;
if (next_cp != end_cp + 1) {
// Current range is over, stop processing.
break;
}
if (next_gid == last_gid + 1) {
// The current run continues.
end_cp = next_cp;
run_length++;
last_gid = next_gid;
it++;
continue;
}
// A new run is starting, decide if we want to commit the current run.
int split_cost = (mode == FIRST_RANGE) ? 8 : 16;
int run_cost = run_length * 2;
if (run_cost >= split_cost) {
commit_current_range(start_cp, run_start_cp, end_cp, delta, split_cost, range_writer);
mode = FOLLOWING_RANGE;
start_cp = next_cp;
}
// Start the new run
run_start_cp = next_cp;
end_cp = next_cp;
delta = next_gid - run_start_cp;
run_length = 1;
last_gid = next_gid;
it++;
} }
prev_endcp = _.first;
// Finalize range
commit_current_range (start_cp, run_start_cp, end_cp, delta, 8, range_writer);
} }
{ if (likely (end_cp != 0xFFFF)) {
// last endCode range_writer (0xFFFF, 0xFFFF, 1);
HBUINT16 endcode; }
endcode = prev_endcp; }
if (unlikely (!c->copy<HBUINT16> (endcode))) return nullptr;
// There must be a final entry with end_code == 0xFFFF. /*
if (prev_endcp != 0xFFFF) * Writes the current range as either one or two ranges depending on what is most efficient.
{ */
HBUINT16 finalcode; template<typename Writer>
finalcode = 0xFFFF; void commit_current_range (hb_codepoint_t start,
if (unlikely (!c->copy<HBUINT16> (finalcode))) return nullptr; hb_codepoint_t run_start,
hb_codepoint_t end,
int run_delta,
int split_cost,
Writer& range_writer) {
bool should_split = false;
if (start < run_start && run_start < end) {
int run_cost = (end - run_start + 1) * 2;
if (run_cost >= split_cost) {
should_split = true;
} }
} }
return endCode; if (should_split) {
range_writer (start, run_start - 1, 0);
range_writer (run_start, end, run_delta);
return;
}
if (start == run_start) {
// Range is only a run
range_writer (start, end, run_delta);
return;
}
// Write only a single non-run range.
run_delta = (start == end) ? run_delta : 0;
range_writer (start, end, run_delta);
} }
template<typename Iterator, template<typename Iterator,
hb_requires (hb_is_iterator (Iterator))> hb_requires (hb_is_iterator (Iterator))>
HBUINT16* serialize_startcode_array (hb_serialize_context_t *c, unsigned serialize_find_segcount (Iterator it) {
Iterator it) struct Counter {
{ unsigned segcount = 0;
HBUINT16 *startCode = c->start_embed<HBUINT16> ();
hb_codepoint_t prev_cp = 0xFFFF;
for (const auto& _ : +it) void operator() (hb_codepoint_t start,
{ hb_codepoint_t end,
if (prev_cp == 0xFFFF || prev_cp + 1u != _.first) int delta) {
{ segcount++;
HBUINT16 start_code;
start_code = _.first;
c->copy<HBUINT16> (start_code);
} }
} counter;
prev_cp = _.first; to_ranges (+it, counter);
} return counter.segcount;
// There must be a final entry with end_code == 0xFFFF.
if (it.len () == 0 || prev_cp != 0xFFFF)
{
HBUINT16 finalcode;
finalcode = 0xFFFF;
if (unlikely (!c->copy<HBUINT16> (finalcode))) return nullptr;
}
return startCode;
} }
template<typename Iterator, template<typename Iterator,
hb_requires (hb_is_iterator (Iterator))> hb_requires (hb_is_iterator (Iterator))>
HBINT16* serialize_idDelta_array (hb_serialize_context_t *c, bool serialize_start_end_delta_arrays (hb_serialize_context_t *c,
Iterator it, Iterator it,
HBUINT16 *endCode, int segcount)
HBUINT16 *startCode,
unsigned segcount)
{ {
unsigned i = 0; struct Writer {
hb_codepoint_t last_gid = 0, start_gid = 0, last_cp = 0xFFFF; hb_serialize_context_t *serializer_;
bool use_delta = true; HBUINT16* end_code_;
HBUINT16* start_code_;
HBINT16* id_delta_;
int index_;
HBINT16 *idDelta = c->start_embed<HBINT16> (); Writer(hb_serialize_context_t *serializer)
if ((char *)idDelta - (char *)startCode != (int) segcount * (int) HBINT16::static_size) : serializer_(serializer),
return nullptr; end_code_(nullptr),
start_code_(nullptr),
for (const auto& _ : +it) id_delta_(nullptr),
{ index_ (0) {}
if (_.first == startCode[i]) void operator() (hb_codepoint_t start,
{ hb_codepoint_t end,
use_delta = true; int delta) {
start_gid = _.second; start_code_[index_] = start;
end_code_[index_] = end;
id_delta_[index_] = delta;
index_++;
} }
else if (_.second != last_gid + 1) use_delta = false; } writer(c);
if (_.first == endCode[i]) writer.end_code_ = c->allocate_size<HBUINT16> (HBUINT16::static_size * segcount);
{ c->allocate_size<HBUINT16> (2); // padding
HBINT16 delta; writer.start_code_ = c->allocate_size<HBUINT16> (HBUINT16::static_size * segcount);
if (use_delta) delta = (int)start_gid - (int)startCode[i]; writer.id_delta_ = c->allocate_size<HBINT16> (HBINT16::static_size * segcount);
else delta = 0;
c->copy<HBINT16> (delta);
i++; if (unlikely (!writer.end_code_ || !writer.start_code_ || !writer.id_delta_)) return false;
}
last_gid = _.second; to_ranges (+it, writer);
last_cp = _.first; return true;
}
if (it.len () == 0 || last_cp != 0xFFFF)
{
HBINT16 delta;
delta = 1;
if (unlikely (!c->copy<HBINT16> (delta))) return nullptr;
}
return idDelta;
} }
template<typename Iterator, template<typename Iterator,
@ -257,22 +305,14 @@ struct CmapSubtableFormat4
if (unlikely (!c->extend_min (this))) return; if (unlikely (!c->extend_min (this))) return;
this->format = 4; this->format = 4;
//serialize endCode[] //serialize endCode[], startCode[], idDelta[]
HBUINT16 *endCode = serialize_endcode_array (c, format4_iter); HBUINT16* endCode = c->start_embed<HBUINT16> ();
if (unlikely (!endCode)) return; unsigned segcount = serialize_find_segcount (format4_iter);
if (unlikely (!serialize_start_end_delta_arrays (c, format4_iter, segcount)))
return;
unsigned segcount = (c->length () - min_size) / HBUINT16::static_size; HBUINT16 *startCode = endCode + segcount + 1;
HBINT16 *idDelta = ((HBINT16*)startCode) + segcount;
// 2 bytes of padding.
if (unlikely (!c->allocate_size<HBUINT16> (HBUINT16::static_size))) return; // 2 bytes of padding.
// serialize startCode[]
HBUINT16 *startCode = serialize_startcode_array (c, format4_iter);
if (unlikely (!startCode)) return;
//serialize idDelta[]
HBINT16 *idDelta = serialize_idDelta_array (c, format4_iter, endCode, startCode, segcount);
if (unlikely (!idDelta)) return;
HBUINT16 *idRangeOffset = serialize_rangeoffset_glyid (c, format4_iter, endCode, startCode, idDelta, segcount); HBUINT16 *idRangeOffset = serialize_rangeoffset_glyid (c, format4_iter, endCode, startCode, idDelta, segcount);
if (unlikely (!c->check_success (idRangeOffset))) return; if (unlikely (!c->check_success (idRangeOffset))) return;