Compare commits
12 Commits
Author | SHA1 | Date |
---|---|---|
Behdad Esfahbod | 90d0200ac7 | |
Behdad Esfahbod | 997809ea4a | |
Behdad Esfahbod | bc25a5694c | |
Behdad Esfahbod | 38c84b2246 | |
Behdad Esfahbod | b7d01281d2 | |
Behdad Esfahbod | 41a7ec0697 | |
Behdad Esfahbod | 6013ead1f5 | |
Behdad Esfahbod | 343da74ec6 | |
Behdad Esfahbod | 6bba9d876a | |
Behdad Esfahbod | 291d30b1ff | |
Behdad Esfahbod | c799742ac1 | |
Behdad Esfahbod | 0033641189 |
|
@ -416,39 +416,47 @@ dump_use_data_SOURCES = dump-use-data.cc hb-ot-shape-complex-use-table.cc
|
||||||
dump_use_data_CPPFLAGS = $(HBCFLAGS)
|
dump_use_data_CPPFLAGS = $(HBCFLAGS)
|
||||||
dump_use_data_LDADD = libharfbuzz.la $(HBLIBS)
|
dump_use_data_LDADD = libharfbuzz.la $(HBLIBS)
|
||||||
|
|
||||||
COMPILED_TESTS = test-algs test-iter test-meta test-number test-ot-tag test-unicode-ranges test-bimap
|
|
||||||
COMPILED_TESTS_CPPFLAGS = $(HBCFLAGS) -DMAIN -UNDEBUG
|
|
||||||
COMPILED_TESTS_LDADD = libharfbuzz.la $(HBLIBS)
|
|
||||||
check_PROGRAMS += $(COMPILED_TESTS)
|
check_PROGRAMS += $(COMPILED_TESTS)
|
||||||
TESTS += $(COMPILED_TESTS)
|
TESTS += $(COMPILED_TESTS)
|
||||||
|
COMPILED_TESTS_CPPFLAGS = $(HBCFLAGS) -DMAIN -UNDEBUG
|
||||||
test_algs_SOURCES = test-algs.cc hb-static.cc
|
COMPILED_TESTS_LDADD = libharfbuzz.la $(HBLIBS)
|
||||||
|
COMPILED_TESTS_SOURCES = \
|
||||||
|
hb-static.cc \
|
||||||
|
$(NULL)
|
||||||
|
COMPILED_TESTS = \
|
||||||
|
test-algs \
|
||||||
|
test-iter \
|
||||||
|
test-meta \
|
||||||
|
test-number \
|
||||||
|
test-simd \
|
||||||
|
test-ot-tag \
|
||||||
|
test-unicode-ranges \
|
||||||
|
test-bimap \
|
||||||
|
$(NULL)
|
||||||
|
test_algs_SOURCES = test-algs.cc $(COMPILED_TESTS_SOURCES)
|
||||||
test_algs_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
test_algs_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
test_algs_LDADD = $(COMPILED_TESTS_LDADD)
|
test_algs_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_bimap_SOURCES = test-bimap.cc $(COMPILED_TESTS_SOURCES)
|
||||||
test_iter_SOURCES = test-iter.cc hb-static.cc
|
|
||||||
test_iter_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
|
||||||
test_iter_LDADD = $(COMPILED_TESTS_LDADD)
|
|
||||||
|
|
||||||
test_meta_SOURCES = test-meta.cc hb-static.cc
|
|
||||||
test_meta_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
|
||||||
test_meta_LDADD = $(COMPILED_TESTS_LDADD)
|
|
||||||
|
|
||||||
test_number_SOURCES = test-number.cc hb-number.cc
|
|
||||||
test_number_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
|
||||||
test_number_LDADD = $(COMPILED_TESTS_LDADD)
|
|
||||||
|
|
||||||
test_ot_tag_SOURCES = hb-ot-tag.cc
|
|
||||||
test_ot_tag_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
|
||||||
test_ot_tag_LDADD = $(COMPILED_TESTS_LDADD)
|
|
||||||
|
|
||||||
test_unicode_ranges_SOURCES = test-unicode-ranges.cc
|
|
||||||
test_unicode_ranges_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
|
||||||
test_unicode_ranges_LDADD = $(COMPILED_TESTS_LDADD)
|
|
||||||
|
|
||||||
test_bimap_SOURCES = test-bimap.cc hb-static.cc
|
|
||||||
test_bimap_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
test_bimap_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
test_bimap_LDADD = $(COMPILED_TESTS_LDADD)
|
test_bimap_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_iter_SOURCES = test-iter.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_iter_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_iter_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_meta_SOURCES = test-meta.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_meta_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_meta_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_number_SOURCES = test-number.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_number_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_number_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_ot_tag_SOURCES = hb-ot-tag.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_ot_tag_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_ot_tag_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_simd_SOURCES = test-simd.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_simd_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_simd_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
test_unicode_ranges_SOURCES = test-unicode-ranges.cc $(COMPILED_TESTS_SOURCES)
|
||||||
|
test_unicode_ranges_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
|
||||||
|
test_unicode_ranges_LDADD = $(COMPILED_TESTS_LDADD)
|
||||||
|
|
||||||
TESTS_ENVIRONMENT = \
|
TESTS_ENVIRONMENT = \
|
||||||
srcdir="$(srcdir)" \
|
srcdir="$(srcdir)" \
|
||||||
|
|
|
@ -148,6 +148,7 @@ HB_BASE_sources = \
|
||||||
hb-set-digest.hh \
|
hb-set-digest.hh \
|
||||||
hb-set.cc \
|
hb-set.cc \
|
||||||
hb-set.hh \
|
hb-set.hh \
|
||||||
|
hb-simd.hh \
|
||||||
hb-shape-plan.cc \
|
hb-shape-plan.cc \
|
||||||
hb-shape-plan.hh \
|
hb-shape-plan.hh \
|
||||||
hb-shape.cc \
|
hb-shape.cc \
|
||||||
|
|
|
@ -31,8 +31,14 @@
|
||||||
#include "hb-algs.hh"
|
#include "hb-algs.hh"
|
||||||
#include "hb-iter.hh"
|
#include "hb-iter.hh"
|
||||||
#include "hb-null.hh"
|
#include "hb-null.hh"
|
||||||
|
#include "hb-simd.hh"
|
||||||
|
|
||||||
|
|
||||||
|
namespace OT {
|
||||||
|
struct HBGlyphID;
|
||||||
|
struct RangeRecord;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Type>
|
template <typename Type>
|
||||||
struct hb_sorted_array_t;
|
struct hb_sorted_array_t;
|
||||||
|
|
||||||
|
@ -303,7 +309,7 @@ struct hb_sorted_array_t :
|
||||||
{
|
{
|
||||||
unsigned pos;
|
unsigned pos;
|
||||||
|
|
||||||
if (bsearch_impl (x, &pos))
|
if (bsearch_impl (x, &pos, hb_prioritize))
|
||||||
{
|
{
|
||||||
if (i)
|
if (i)
|
||||||
*i = pos;
|
*i = pos;
|
||||||
|
@ -328,8 +334,9 @@ struct hb_sorted_array_t :
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool bsearch_impl (const T &x, unsigned *pos) const
|
bool bsearch_impl (const T &x, unsigned *pos, hb_priority<0>) const
|
||||||
{
|
{
|
||||||
return hb_bsearch_impl (pos,
|
return hb_bsearch_impl (pos,
|
||||||
x,
|
x,
|
||||||
|
@ -338,6 +345,89 @@ struct hb_sorted_array_t :
|
||||||
sizeof (Type),
|
sizeof (Type),
|
||||||
_hb_cmp_method<T, Type>);
|
_hb_cmp_method<T, Type>);
|
||||||
}
|
}
|
||||||
|
#ifndef HB_NO_SIMD
|
||||||
|
template <typename U = Type,
|
||||||
|
hb_enable_if (hb_is_same (hb_decay<U>, OT::HBGlyphID))>
|
||||||
|
bool bsearch_impl (hb_codepoint_t x, unsigned *pos, hb_priority<1>) const
|
||||||
|
{
|
||||||
|
#ifdef HB_SIMD_VERIFY
|
||||||
|
unsigned p0;
|
||||||
|
bool f0 = hb_bsearch_impl (&p0,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type),
|
||||||
|
_hb_cmp_method<hb_codepoint_t, Type>);
|
||||||
|
unsigned p1;
|
||||||
|
bool f1 = hb_simd_ksearch_glyphid (&p1,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type));
|
||||||
|
if (f0 != f1 || p0 != p1)
|
||||||
|
{
|
||||||
|
fprintf (stderr, "FAILED: Searching for %d; expected %d @ %d; got %d @ %d\n",
|
||||||
|
x, f0, p0, f1, p1);
|
||||||
|
for (unsigned i = 0; i < this->length; i++)
|
||||||
|
printf ("Glyph %d: %d\n", i,
|
||||||
|
(unsigned) this->arrayZ[i]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (likely (this->length < 64))
|
||||||
|
return hb_bsearch_impl (pos,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type),
|
||||||
|
_hb_cmp_method<hb_codepoint_t, Type>);
|
||||||
|
return hb_simd_ksearch_glyphid (pos,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type));
|
||||||
|
}
|
||||||
|
template <typename U = Type,
|
||||||
|
hb_enable_if (hb_is_same (hb_decay<U>, OT::RangeRecord) && false)>
|
||||||
|
bool bsearch_impl (hb_codepoint_t x, unsigned *pos, hb_priority<1>) const
|
||||||
|
{
|
||||||
|
#ifdef HB_SIMD_VERIFY
|
||||||
|
unsigned p0;
|
||||||
|
bool f0 = hb_bsearch_impl (&p0,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type),
|
||||||
|
_hb_cmp_method<hb_codepoint_t, Type>);
|
||||||
|
unsigned p1;
|
||||||
|
bool f1 = hb_simd_ksearch_glyphid_range (&p1,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type));
|
||||||
|
if (f0 != f1 || p0 != p1)
|
||||||
|
{
|
||||||
|
fprintf (stderr, "FAILED: Searching for %d; expected %d @ %d; got %d @ %d\n",
|
||||||
|
x, f0, p0, f1, p1);
|
||||||
|
for (unsigned i = 0; i < this->length; i++)
|
||||||
|
printf ("Range %d: %d..%d\n", i,
|
||||||
|
(unsigned) this->arrayZ[i].first,
|
||||||
|
(unsigned) this->arrayZ[i].last);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (likely (this->length < 64))
|
||||||
|
return hb_bsearch_impl (pos,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type),
|
||||||
|
_hb_cmp_method<hb_codepoint_t, Type>);
|
||||||
|
return hb_simd_ksearch_glyphid_range (pos,
|
||||||
|
x,
|
||||||
|
this->arrayZ,
|
||||||
|
this->length,
|
||||||
|
sizeof (Type));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
template <typename T> inline hb_sorted_array_t<T>
|
template <typename T> inline hb_sorted_array_t<T>
|
||||||
hb_sorted_array (T *array, unsigned int length)
|
hb_sorted_array (T *array, unsigned int length)
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "hb.hh"
|
#include "hb.hh"
|
||||||
|
#include "hb-number.hh"
|
||||||
#include "hb-machinery.hh"
|
#include "hb-machinery.hh"
|
||||||
#include "hb-number.hh"
|
#include "hb-number.hh"
|
||||||
#include "hb-number-parser.hh"
|
#include "hb-number-parser.hh"
|
||||||
|
|
|
@ -0,0 +1,301 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2019 Facebook, Inc.
|
||||||
|
*
|
||||||
|
* This is part of HarfBuzz, a text shaping library.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, without written agreement and without
|
||||||
|
* license or royalty fees, to use, copy, modify, and distribute this
|
||||||
|
* software and its documentation for any purpose, provided that the
|
||||||
|
* above copyright notice and the following two paragraphs appear in
|
||||||
|
* all copies of this software.
|
||||||
|
*
|
||||||
|
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
|
||||||
|
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
|
||||||
|
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||||
|
* DAMAGE.
|
||||||
|
*
|
||||||
|
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
|
||||||
|
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
|
||||||
|
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
|
||||||
|
*
|
||||||
|
* Facebook Author(s): Behdad Esfahbod
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HB_SIMD_HH
|
||||||
|
#define HB_SIMD_HH
|
||||||
|
|
||||||
|
#include "hb.hh"
|
||||||
|
#include "hb-meta.hh"
|
||||||
|
#include "hb-algs.hh"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* = MOTIVATION
|
||||||
|
*
|
||||||
|
* We hit a performance plateau with HarfBuzz many many years ago. The last
|
||||||
|
* *major* speedup was when hb_set_digest_t was added back in 2012. Since then
|
||||||
|
* we have shaved off a lot or extra work, but none of that matters ince
|
||||||
|
* shaping time is dominated by two hot spots, which no tweaks whatsoever could
|
||||||
|
* speed up any more. SIMD is the only chance to gain a major speedup over
|
||||||
|
* that, and that is what this file is about.
|
||||||
|
*
|
||||||
|
* For heavy fonts, ie. those having dozens, or over a hundred, lookups (Amiri,
|
||||||
|
* IranNastaliq, NotoNastaliqUrdu, heavy Indic fonts, etc), the bottleneck is
|
||||||
|
* just looping over lookups and for each lookup over the buffer contents and
|
||||||
|
* testing wheater current lookup applies to current glyph; normally that would
|
||||||
|
* be a binary search in the Coverage table, but that's where the
|
||||||
|
* hb_set_digest_t speedup came from: hb_set_digest_t are narrow (3 or 4
|
||||||
|
* integers) structures that implement approximate matching, similar to Bloom
|
||||||
|
* Filters or Quotient Filters. These digests do all their work using bitwise
|
||||||
|
* operations, so they can be easily vectorized. Combined with a gather
|
||||||
|
* operation, or just multiple fetches in a row (which should parallelize) when
|
||||||
|
* gather is not available. This will allow us to skip over 8 or 16 glyphs at
|
||||||
|
* a time.
|
||||||
|
*
|
||||||
|
* For fast fonts, like simple Latin fonts, like Roboto, the majority of time
|
||||||
|
* is spent in binary searching in the Coverage table of kern and liga lookups.
|
||||||
|
* We can, again, use vector gather and comparison operations to implement a
|
||||||
|
* 9ary or 17ary search instead of binary search, which will reduce search
|
||||||
|
* depth by 3x / 4x respectively. It's important to keep in mind that a
|
||||||
|
* 16-at-a-time 17ary search is /not/ in any way 17 times faster. Only 4 times
|
||||||
|
* faster at best since the number of search steps compared to binary search is
|
||||||
|
* log(17)/log(2) ~= 4. That should be taken into account while assessing
|
||||||
|
* various designs.
|
||||||
|
*
|
||||||
|
* The rest of this files adds facilities to implement those, and possibly
|
||||||
|
* more.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* = INTRO to VECTOR EXTENSIONS
|
||||||
|
*
|
||||||
|
* A great series of short articles to get started with vector extensions is:
|
||||||
|
*
|
||||||
|
* https://www.codingame.com/playgrounds/283/sse-avx-vectorization/
|
||||||
|
*
|
||||||
|
* If analyzing existing vector implementation to improve performance, the
|
||||||
|
* following comes handy since it has instruction latencies, throughputs, and
|
||||||
|
* micro-operation breakdown:
|
||||||
|
*
|
||||||
|
* https://www.agner.org/optimize/instruction_tables.pdf
|
||||||
|
*
|
||||||
|
* For x86 programming the following intrinsics guide and cheatsheet are
|
||||||
|
* indispensible:
|
||||||
|
*
|
||||||
|
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||||
|
* https://db.in.tum.de/~finis/x86-intrin-cheatsheet-v2.2.pdf
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* = WHICH EXTENSIONS TO TARGET
|
||||||
|
*
|
||||||
|
* There are four vector extensions that matter for integer work like we need:
|
||||||
|
*
|
||||||
|
* On x86 / x86_64 (Intel / AMD), those are (in order of introduction):
|
||||||
|
*
|
||||||
|
* - SSE2 128 bits
|
||||||
|
* - AVX2 256 bits
|
||||||
|
* - AVX-512 512 bits
|
||||||
|
*
|
||||||
|
* On ARM there seems to be just:
|
||||||
|
*
|
||||||
|
* - NEON 128 bits
|
||||||
|
*
|
||||||
|
* Intel also provides an implementation of the NEON intrinsics API backed by SSE:
|
||||||
|
*
|
||||||
|
* https://github.com/intel/ARM_NEON_2_x86_SSE
|
||||||
|
*
|
||||||
|
* So it's possible we can skip implementing SSE2 separately.
|
||||||
|
*
|
||||||
|
* To see which extensions are available on your machine you can, eg.:
|
||||||
|
*
|
||||||
|
* $ gcc -march=native -dM -E - < /dev/null | egrep "SSE|AVX" | sort
|
||||||
|
* #define __AVX__ 1
|
||||||
|
* #define __AVX2__ 1
|
||||||
|
* #define __SSE__ 1
|
||||||
|
* #define __SSE2__ 1
|
||||||
|
* #define __SSE2_MATH__ 1
|
||||||
|
* #define __SSE3__ 1
|
||||||
|
* #define __SSE4_1__ 1
|
||||||
|
* #define __SSE4_2__ 1
|
||||||
|
* #define __SSE_MATH__ 1
|
||||||
|
* #define __SSSE3__ 1
|
||||||
|
*
|
||||||
|
* If no arch is set, my Fedora defaults to enabling SSE2 but not any AVX:
|
||||||
|
*
|
||||||
|
* $ gcc -dM -E - < /dev/null | egrep "SSE|AVX" | sort
|
||||||
|
* #define __SSE__ 1
|
||||||
|
* #define __SSE2__ 1
|
||||||
|
* #define __SSE2_MATH__ 1
|
||||||
|
* #define __SSE_MATH__ 1
|
||||||
|
*
|
||||||
|
* Given that, we should explore an SSE2 target at some point for sure
|
||||||
|
* (possibly via the NEON path). But initially, targetting AVX2 makes most
|
||||||
|
* sense, for the following reasons:
|
||||||
|
*
|
||||||
|
* - The number of the integer operations we do on each vector is very few.
|
||||||
|
* This means that how fast we can load data into the vector registers is of
|
||||||
|
* paramount importance on the resulting performance. AVX2 is the first
|
||||||
|
* extension to add a "gather" operation. We will use that to load 8 or 16
|
||||||
|
* glyphs from non-consecutive memory addresses into one vector.
|
||||||
|
*
|
||||||
|
* - AVX-512 is practically the same as AVX2 but extended to 512 bits instead
|
||||||
|
* of 256. However, it's not widely available on lower-power CPUs. For
|
||||||
|
* example, my 2019 ThinkPad Yoga X1 does *not* support it. We should
|
||||||
|
* definitely explore that, but not initially. Also, it is possible that the
|
||||||
|
* extra memory load that puts will defeat the speedup we can gain from it.
|
||||||
|
* Also do note that for the search usecase, doubling the bitwidth from 256
|
||||||
|
* to 512, as discussed, only has hard max benefit cap of less than 30%
|
||||||
|
* speedup (log(17) / log(9)). Must be implemented and measured carefully.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* DESIGN
|
||||||
|
*
|
||||||
|
* There are a few complexities to using AVX2 though, which we need to
|
||||||
|
* overcome. The main one that sticks out is:
|
||||||
|
*
|
||||||
|
* While the 256 bit integer register (called __m256i) can be used as 16
|
||||||
|
* uint16_t values, there /doens't/ exist a 16bit version of the gather
|
||||||
|
* operation. The closest there is gathers 8 uint32_t values
|
||||||
|
* (_mm_[mask_]i32gather_epi32()). I believe this is because there are only
|
||||||
|
* eight ports to fetch from memory concurrently.
|
||||||
|
*
|
||||||
|
* Whatever the reason, this is wasteful. OpenType glyph indices are 16bit
|
||||||
|
* numbers, so in theory we should be able to process 16 at a time, if there
|
||||||
|
* was a 16bit gather opperator.
|
||||||
|
*
|
||||||
|
* This is also problematic for loading GlyphID values in that we should make
|
||||||
|
* sure the extra two bytes being loaded by the 32bit gather operator does
|
||||||
|
* not cause invalid memory access, before we throw those extra bytes away.
|
||||||
|
*
|
||||||
|
* There are a couple of different approaches we can take to mitigate these:
|
||||||
|
*
|
||||||
|
* Making two gather calls sequentically and swizzling the results together
|
||||||
|
* might work just fine. Specially since the second one will be fulfilled
|
||||||
|
* straight from the L1 cachelines.
|
||||||
|
*
|
||||||
|
* Another snag I hit is that AVX2 only has signed comparisons, not unsigned.
|
||||||
|
* So we add a shift to convert uint16_t numbers to int16_t before comparing.
|
||||||
|
*
|
||||||
|
* Also note that the order of arguments to _mm256_set_epi32() and family
|
||||||
|
* is opposite of what I originally assumed. Docs are correct, just not
|
||||||
|
* what you assume.
|
||||||
|
*
|
||||||
|
* PREFETCH
|
||||||
|
*
|
||||||
|
* We should use prefetch instructions to request load of the next batch of
|
||||||
|
* hb_glyph_info_t's while we process the current batch.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Choose intrinsics set to use.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(HB_NO_SIMD) && (defined(HB_SIMD_AVX2) || defined(__AVX2__))
|
||||||
|
|
||||||
|
#pragma GCC target("avx2")
|
||||||
|
|
||||||
|
#include <x86intrin.h>
|
||||||
|
|
||||||
|
/* TODO: Test -mvzeroupper. */
|
||||||
|
|
||||||
|
|
||||||
|
template <bool is_range = false>
|
||||||
|
static inline bool
|
||||||
|
hb_simd_ksearch_glyphid (unsigned *pos, /* Out */
|
||||||
|
hb_codepoint_t k,
|
||||||
|
const void *base,
|
||||||
|
size_t length,
|
||||||
|
size_t stride)
|
||||||
|
{
|
||||||
|
if (unlikely (k & ~0xFFFF))
|
||||||
|
{
|
||||||
|
*pos = length;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
*pos = 0;
|
||||||
|
|
||||||
|
/* Find deptch of search tree. */
|
||||||
|
static const unsigned steps[] = {1, 9, 81, 729, 6561, 59049};
|
||||||
|
unsigned rank = 1;
|
||||||
|
while (rank < ARRAY_LENGTH (steps) && length >= steps[rank])
|
||||||
|
rank++;
|
||||||
|
|
||||||
|
static const __m256i _1x8 = _mm256_set1_epi32 (+1);
|
||||||
|
static const __m256i __1x8 = _mm256_set1_epi32 (-1);
|
||||||
|
static const __m256i _12345678 = _mm256_set_epi32 (8, 7, 6, 5, 4, 3, 2, 1);
|
||||||
|
static const __m256i __32768x16 = _mm256_set1_epi16 (-32768);
|
||||||
|
|
||||||
|
/* Set up key vector. */
|
||||||
|
const __m256i K = _mm256_add_epi16 (_mm256_set1_epi16 ((signed) k - 32768), _1x8);
|
||||||
|
|
||||||
|
while (rank)
|
||||||
|
{
|
||||||
|
unsigned step = steps[--rank];
|
||||||
|
|
||||||
|
/* Load multiple ranges to test against. */
|
||||||
|
const unsigned limit = stride * length;
|
||||||
|
const __m256i limits = _mm256_set1_epi32 (limit + 1);
|
||||||
|
const unsigned pitch = stride * step;
|
||||||
|
const __m256i pitches = _mm256_set1_epi32 (pitch);
|
||||||
|
const __m256i offsets = _mm256_mullo_epi32 (pitches, _12345678);
|
||||||
|
const __m256i mask = _mm256_cmpgt_epi32 (limits, offsets);
|
||||||
|
unsigned back_off = stride;
|
||||||
|
|
||||||
|
/* The actual load... */
|
||||||
|
__m256i V = _mm256_mask_i32gather_epi32 (__1x8,
|
||||||
|
(const int *) ((const char *) base - back_off),
|
||||||
|
offsets, mask, 1);
|
||||||
|
#if __BYTE_ORDER == __LITTLE_ENDIAN
|
||||||
|
static const __m256i bswap16_shuffle = _mm256_set_epi16 (0x0E0F,0x0C0D,0x0A0B,0x0809,
|
||||||
|
0x0607,0x0405,0x0203,0x0001,
|
||||||
|
0x0E0F,0x0C0D,0x0A0B,0x0809,
|
||||||
|
0x0607,0x0405,0x0203,0x0001);
|
||||||
|
V = _mm256_shuffle_epi8 (V, bswap16_shuffle);
|
||||||
|
#endif
|
||||||
|
if (!is_range)
|
||||||
|
{
|
||||||
|
static const __m256i dup_shuffle = _mm256_set_epi16 (0x0D0C,0x0D0C,0x0908,0x0908,
|
||||||
|
0x0504,0x0504,0x0100,0x0100,
|
||||||
|
0x0D0C,0x0D0C,0x0908,0x0908,
|
||||||
|
0x0504,0x0504,0x0100,0x0100);
|
||||||
|
V = _mm256_shuffle_epi8 (V, dup_shuffle);
|
||||||
|
}
|
||||||
|
V = _mm256_add_epi16 (V, __32768x16);
|
||||||
|
|
||||||
|
/* Compare and locate. */
|
||||||
|
unsigned answer = hb_ctz (~_mm256_movemask_epi8 (_mm256_cmpgt_epi16 (K, V))) >> 1;
|
||||||
|
bool found = answer & 1;
|
||||||
|
answer = (answer + 1) >> 1;
|
||||||
|
unsigned move = step * answer;
|
||||||
|
*pos += move;
|
||||||
|
if (found)
|
||||||
|
{
|
||||||
|
*pos -= 1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
length -= move;
|
||||||
|
base = (const void *) ((const char *) base + stride * move);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool
|
||||||
|
hb_simd_ksearch_glyphid_range (unsigned *pos, /* Out */
|
||||||
|
hb_codepoint_t k,
|
||||||
|
const void *base,
|
||||||
|
size_t length,
|
||||||
|
size_t stride)
|
||||||
|
{
|
||||||
|
return hb_simd_ksearch_glyphid<true> (pos, k, base, length, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif !defined(HB_NO_SIMD)
|
||||||
|
#define HB_NO_SIMD
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* HB_SIMD_HH */
|
|
@ -608,8 +608,9 @@ struct BEInt<Type, 4>
|
||||||
#include "hb-null.hh" // Requires: hb-meta
|
#include "hb-null.hh" // Requires: hb-meta
|
||||||
#include "hb-algs.hh" // Requires: hb-meta hb-null hb-number
|
#include "hb-algs.hh" // Requires: hb-meta hb-null hb-number
|
||||||
#include "hb-iter.hh" // Requires: hb-algs hb-meta
|
#include "hb-iter.hh" // Requires: hb-algs hb-meta
|
||||||
|
#include "hb-simd.hh" // Requires: hb-algs hb-meta
|
||||||
#include "hb-debug.hh" // Requires: hb-algs hb-atomic
|
#include "hb-debug.hh" // Requires: hb-algs hb-atomic
|
||||||
#include "hb-array.hh" // Requires: hb-algs hb-iter hb-null
|
#include "hb-array.hh" // Requires: hb-algs hb-iter hb-null hb-simd
|
||||||
#include "hb-vector.hh" // Requires: hb-array hb-null
|
#include "hb-vector.hh" // Requires: hb-array hb-null
|
||||||
#include "hb-object.hh" // Requires: hb-atomic hb-mutex hb-vector
|
#include "hb-object.hh" // Requires: hb-atomic hb-mutex hb-vector
|
||||||
|
|
||||||
|
|
|
@ -24,8 +24,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "hb.hh"
|
#include "hb.hh"
|
||||||
#include "hb-number.hh"
|
#include "hb-number.cc"
|
||||||
#include "hb-number-parser.hh"
|
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2019 Facebook, Inc.
|
||||||
|
*
|
||||||
|
* This is part of HarfBuzz, a text shaping library.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, without written agreement and without
|
||||||
|
* license or royalty fees, to use, copy, modify, and distribute this
|
||||||
|
* software and its documentation for any purpose, provided that the
|
||||||
|
* above copyright notice and the following two paragraphs appear in
|
||||||
|
* all copies of this software.
|
||||||
|
*
|
||||||
|
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
|
||||||
|
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||||
|
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
|
||||||
|
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||||
|
* DAMAGE.
|
||||||
|
*
|
||||||
|
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
|
||||||
|
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
|
||||||
|
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
|
||||||
|
*
|
||||||
|
* Facebook Author(s): Behdad Esfahbod
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "hb.hh"
|
||||||
|
#include "hb-simd.hh"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct U16
|
||||||
|
{
|
||||||
|
U16 (unsigned v_)
|
||||||
|
{
|
||||||
|
v[0] = v_ >> 8;
|
||||||
|
v[1] = v_ & 0xFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t v[2];
|
||||||
|
};
|
||||||
|
|
||||||
|
int
|
||||||
|
main (int argc, char **argv)
|
||||||
|
{
|
||||||
|
#ifndef HB_NO_SIMD
|
||||||
|
|
||||||
|
const U16 a[] = {1, 2, 5, 10, 16, 19};
|
||||||
|
|
||||||
|
#define TEST(k, f, p) \
|
||||||
|
{ \
|
||||||
|
unsigned pos = 123456789; \
|
||||||
|
bool found = hb_simd_ksearch_glyphid_range (&pos, \
|
||||||
|
k, \
|
||||||
|
a, \
|
||||||
|
ARRAY_LENGTH (a) / 2, \
|
||||||
|
sizeof (a[0]) * 2); \
|
||||||
|
/*printf ("key %d found %d pos %d\n", k, found, pos);*/ \
|
||||||
|
assert (found == f && pos == p); \
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST (0, false, 0);
|
||||||
|
TEST (1, true , 0);
|
||||||
|
TEST (2, true , 0);
|
||||||
|
TEST (3, false, 1);
|
||||||
|
TEST (4, false, 1);
|
||||||
|
TEST (5, true , 1);
|
||||||
|
TEST (6, true , 1);
|
||||||
|
TEST (7, true , 1);
|
||||||
|
TEST (8, true , 1);
|
||||||
|
TEST (9, true , 1);
|
||||||
|
TEST (10, true , 1);
|
||||||
|
TEST (11, false, 2);
|
||||||
|
TEST (12, false, 2);
|
||||||
|
TEST (13, false, 2);
|
||||||
|
TEST (14, false, 2);
|
||||||
|
TEST (15, false, 2);
|
||||||
|
TEST (16, true , 2);
|
||||||
|
TEST (17, true , 2);
|
||||||
|
TEST (18, true , 2);
|
||||||
|
TEST (19, true , 2);
|
||||||
|
TEST (20, false, 3);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue