[simd] Implement 9ary search for RangeRecord

Correctness looks good.  Performance has not been measured yet.

Part of https://github.com/harfbuzz/harfbuzz/issues/566
This commit is contained in:
Behdad Esfahbod 2019-12-08 18:55:11 -06:00
parent 291d30b1ff
commit 6bba9d876a
7 changed files with 199 additions and 54 deletions

View File

@ -416,39 +416,47 @@ dump_use_data_SOURCES = dump-use-data.cc hb-ot-shape-complex-use-table.cc
dump_use_data_CPPFLAGS = $(HBCFLAGS)
dump_use_data_LDADD = libharfbuzz.la $(HBLIBS)
COMPILED_TESTS = test-algs test-iter test-meta test-number test-ot-tag test-unicode-ranges test-bimap
COMPILED_TESTS_CPPFLAGS = $(HBCFLAGS) -DMAIN -UNDEBUG
COMPILED_TESTS_LDADD = libharfbuzz.la $(HBLIBS)
check_PROGRAMS += $(COMPILED_TESTS)
TESTS += $(COMPILED_TESTS)
test_algs_SOURCES = test-algs.cc hb-static.cc
COMPILED_TESTS_CPPFLAGS = $(HBCFLAGS) -DMAIN -UNDEBUG
COMPILED_TESTS_LDADD = libharfbuzz.la $(HBLIBS)
COMPILED_TESTS_SOURCES = \
hb-static.cc \
$(NULL)
COMPILED_TESTS = \
test-algs \
test-iter \
test-meta \
test-number \
test-simd \
test-ot-tag \
test-unicode-ranges \
test-bimap \
$(NULL)
test_algs_SOURCES = test-algs.cc $(COMPILED_TESTS_SOURCES)
test_algs_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_algs_LDADD = $(COMPILED_TESTS_LDADD)
test_iter_SOURCES = test-iter.cc hb-static.cc
test_iter_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_iter_LDADD = $(COMPILED_TESTS_LDADD)
test_meta_SOURCES = test-meta.cc hb-static.cc
test_meta_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_meta_LDADD = $(COMPILED_TESTS_LDADD)
test_number_SOURCES = test-number.cc hb-number.cc
test_number_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_number_LDADD = $(COMPILED_TESTS_LDADD)
test_ot_tag_SOURCES = hb-ot-tag.cc
test_ot_tag_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_ot_tag_LDADD = $(COMPILED_TESTS_LDADD)
test_unicode_ranges_SOURCES = test-unicode-ranges.cc
test_unicode_ranges_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_unicode_ranges_LDADD = $(COMPILED_TESTS_LDADD)
test_bimap_SOURCES = test-bimap.cc hb-static.cc
test_bimap_SOURCES = test-bimap.cc $(COMPILED_TESTS_SOURCES)
test_bimap_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_bimap_LDADD = $(COMPILED_TESTS_LDADD)
test_iter_SOURCES = test-iter.cc $(COMPILED_TESTS_SOURCES)
test_iter_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_iter_LDADD = $(COMPILED_TESTS_LDADD)
test_meta_SOURCES = test-meta.cc $(COMPILED_TESTS_SOURCES)
test_meta_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_meta_LDADD = $(COMPILED_TESTS_LDADD)
test_number_SOURCES = test-number.cc $(COMPILED_TESTS_SOURCES)
test_number_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_number_LDADD = $(COMPILED_TESTS_LDADD)
test_ot_tag_SOURCES = hb-ot-tag.cc $(COMPILED_TESTS_SOURCES)
test_ot_tag_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_ot_tag_LDADD = $(COMPILED_TESTS_LDADD)
test_simd_SOURCES = test-simd.cc $(COMPILED_TESTS_SOURCES)
test_simd_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_simd_LDADD = $(COMPILED_TESTS_LDADD)
test_unicode_ranges_SOURCES = test-unicode-ranges.cc $(COMPILED_TESTS_SOURCES)
test_unicode_ranges_CPPFLAGS = $(COMPILED_TESTS_CPPFLAGS)
test_unicode_ranges_LDADD = $(COMPILED_TESTS_LDADD)
TESTS_ENVIRONMENT = \
srcdir="$(srcdir)" \

View File

@ -346,19 +346,17 @@ struct hb_sorted_array_t :
_hb_cmp_method<T, Type>);
}
#ifndef HB_NO_SIMD
#if 0
template <typename U = Type,
hb_enable_if (hb_is_same (hb_decay<U>, OT::RangeRecord))>
bool bsearch_impl (hb_codepoint_t x, unsigned *pos, hb_priority<1>) const
{
return hb_simd_bsearch_glyphid_range (pos,
return hb_simd_ksearch_glyphid_range (pos,
x,
this->arrayZ,
this->length,
sizeof (Type));
}
#endif
#endif
};
template <typename T> inline hb_sorted_array_t<T>
hb_sorted_array (T *array, unsigned int length)

View File

@ -24,6 +24,7 @@
*/
#include "hb.hh"
#include "hb-number.hh"
#include "hb-machinery.hh"
#include "hb-number.hh"
#include "hb-number-parser.hh"

View File

@ -29,6 +29,7 @@
#include "hb.hh"
#include "hb-meta.hh"
#include "hb-algs.hh"
/*
* = MOTIVATION
@ -173,6 +174,12 @@
* might work just fine. Especially since the second one will be fulfilled
* straight from the L1 cachelines.
*
* Another snag I hit is that AVX2 only has signed comparisons, not unsigned.
* So we bias the uint16_t numbers by -32768 to map them onto int16_t before
* comparing.
*
* Also note that _mm256_set_epi32() and family take their arguments from
* the highest element down to the lowest, which is the opposite of what I
* originally assumed. The docs are correct, just counter-intuitive.
*
* PREFETCH
*
@ -194,33 +201,81 @@
/* TODO: Test -mvzeroupper. */
static __m256i x HB_UNUSED;
/*
 * 9-ary search over an array of glyph-range records using AVX2.
 *
 * Each record is `stride` bytes long and starts with two big-endian 16-bit
 * values, the first and last glyph of the range.  Every iteration gathers up
 * to eight evenly spaced records and compares all of them against the key at
 * once, narrowing the search to one of nine sub-blocks.
 *
 * pos:    out; on a hit, the index of the record containing k.  On a miss,
 *         the number of records skipped during the descent (set to `length`
 *         when k does not fit in 16 bits).
 * k:      the glyph id to look up.
 * base:   address of the first record.
 * length: number of records.
 * stride: size of one record, in bytes.
 *
 * Returns true iff some record satisfies first <= k <= last.
 *
 * NOTE(review): for k == 0xFFFF the "+1" applied to the start lanes of the
 * key vector wraps around in 16 bits, so a range containing 0xFFFF does not
 * seem to be matched -- confirm whether callers can pass that key.
 */
static inline bool
hb_simd_ksearch_glyphid_range (unsigned *pos, /* Out */
			       hb_codepoint_t k,
			       const void *base,
			       size_t length,
			       size_t stride)
{
  /* Keys that do not fit in 16 bits cannot be in any range. */
  if (unlikely (k & ~0xFFFF))
  {
    *pos = length;
    return false;
  }

  *pos = 0;

#define HB_2TIMES(x) (x), (x)
#define HB_4TIMES(x) HB_2TIMES(x), HB_2TIMES (x)
#define HB_8TIMES(x) HB_4TIMES(x), HB_4TIMES (x)
#define HB_16TIMES(x) HB_8TIMES (x), HB_8TIMES (x)

  /* Find depth of search tree. */
  static const unsigned steps[] = {1, 9, 81, 729, 6561, 59049};
  unsigned rank = 1;
  while (rank < ARRAY_LENGTH (steps) && length >= steps[rank])
    rank++;

  static const __m256i _1x8 = _mm256_set_epi32 (HB_8TIMES (1));
  /* Must NOT be static: it depends on the `stride` argument, and a static
   * local would keep the value from the very first call forever. */
  const __m256i stridex8 = _mm256_set_epi32 (HB_8TIMES (stride));
  static const __m256i __1x8 = _mm256_set_epi32 (HB_8TIMES (-1));
  static const __m256i _12345678 = _mm256_set_epi32 (8, 7, 6, 5, 4, 3, 2, 1);
  static const __m256i __32768x16 = _mm256_set_epi16 (HB_16TIMES (-32768));

  /* Set up key vector.  Every 16-bit lane holds the biased key; adding the
   * 32-bit ones bumps only the low (range-start) lane of each pair, so that
   * "K > start" below really tests "k >= start". */
  const __m256i K = _mm256_add_epi16 (_mm256_set_epi16 (HB_16TIMES ((signed) k - 32768)), _1x8);

  while (rank)
  {
    unsigned step = steps[--rank];

    /* Load multiple ranges to test against. */
    const unsigned limit = stride * length;
    const __m256i limits = _mm256_set_epi32 (HB_8TIMES (limit));
    const unsigned pitch = stride * step;
    const __m256i pitches = _mm256_set_epi32 (HB_8TIMES (pitch));
    /* Byte offsets of records step*1-1, step*2-1, ..., step*8-1. */
    const __m256i offsets = _mm256_sub_epi32 (_mm256_mullo_epi32 (pitches, _12345678), stridex8);
    /* Only gather lanes whose offset lies inside the remaining array. */
    const __m256i mask = _mm256_cmpgt_epi32 (limits, offsets);

    /* The actual load...  Masked-out lanes keep the all-ones filler, which
     * compares as larger than any key. */
    __m256i V = _mm256_mask_i32gather_epi32 (__1x8, (const int *) base, offsets, mask, 1);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    /* Swap the two bytes of every 16-bit lane: the data is big-endian. */
    V = _mm256_add_epi16 (_mm256_slli_epi16 (V, 8),
			  _mm256_srli_epi16 (V, 8));
#endif
    /* Bias to signed, since AVX2 only offers signed comparisons. */
    V = _mm256_add_epi16 (V, __32768x16);

    /* Compare and locate.  movemask yields two bits per 16-bit lane, so the
     * halved count of trailing set bits is the index of the first lane for
     * which K > V does not hold. */
    unsigned answer = hb_ctz (~_mm256_movemask_epi8 (_mm256_cmpgt_epi16 (K, V))) >> 1;
    /* An odd lane index means "k >= start" held but "k > end" did not. */
    bool found = answer & 1;
    answer = (answer + 1) >> 1;
    unsigned move = step * answer;
    *pos += move;
    if (found)
    {
      *pos -= 1;
      return true;
    }
    length -= move;
    base = (const void *) ((const char *) base + stride * move);
  }

  return false;
}
#elif !defined(HB_NO_SIMD)
#define HB_NO_SIMD
#endif
/*
* Use to implement faster specializations.
*/
#ifndef HB_NO_SIMD
#if 0
static inline bool
hb_simd_bsearch_glyphid_range (unsigned *pos, /* Out */
hb_codepoint_t k,
const void *base,
size_t length,
size_t stride)
{
}
#endif
#endif
#endif /* HB_SIMD_HH */

View File

@ -604,11 +604,11 @@ struct BEInt<Type, 4>
#include "hb-meta.hh"
#include "hb-mutex.hh"
#include "hb-number.hh"
#include "hb-simd.hh" // Requires: hb-meta
#include "hb-atomic.hh" // Requires: hb-meta
#include "hb-null.hh" // Requires: hb-meta
#include "hb-algs.hh" // Requires: hb-meta hb-null hb-number
#include "hb-iter.hh" // Requires: hb-algs hb-meta
#include "hb-simd.hh" // Requires: hb-algs hb-meta
#include "hb-debug.hh" // Requires: hb-algs hb-atomic
#include "hb-array.hh" // Requires: hb-algs hb-iter hb-null hb-simd
#include "hb-vector.hh" // Requires: hb-array hb-null

View File

@ -24,8 +24,7 @@
*/
#include "hb.hh"
#include "hb-number.hh"
#include "hb-number-parser.hh"
#include "hb-number.cc"
int

84
src/test-simd.cc Normal file
View File

@ -0,0 +1,84 @@
/*
* Copyright © 2019 Facebook, Inc.
*
* This is part of HarfBuzz, a text shaping library.
*
* Permission is hereby granted, without written agreement and without
* license or royalty fees, to use, copy, modify, and distribute this
* software and its documentation for any purpose, provided that the
* above copyright notice and the following two paragraphs appear in
* all copies of this software.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*
* Facebook Author(s): Behdad Esfahbod
*/
#include "hb.hh"
#include "hb-simd.hh"
/* A 16-bit unsigned value stored as two bytes, most-significant byte first. */
struct U16
{
  /* Deliberately implicit, so plain integers convert to U16. */
  U16 (unsigned v_) :
    v {static_cast<uint8_t> (v_ >> 8),
       static_cast<uint8_t> (v_ & 0xFF)} {}

  uint8_t v[2];
};
int
main (int argc, char **argv)
{
const U16 a[] = {1, 2, 5, 10, 16, 19};
#define TEST(k, f, p) \
{ \
unsigned pos = 123456789; \
bool found = hb_simd_ksearch_glyphid_range (&pos, \
k, \
a, \
ARRAY_LENGTH (a) / 2, \
sizeof (a[0]) * 2); \
/*printf ("key %d found %d pos %d\n", k, found, pos);*/ \
assert (found == f && pos == p); \
}
TEST (0, false, 0);
TEST (1, true , 0);
TEST (2, true , 0);
TEST (3, false, 1);
TEST (4, false, 1);
TEST (5, true , 1);
TEST (6, true , 1);
TEST (7, true , 1);
TEST (8, true , 1);
TEST (9, true , 1);
TEST (10, true , 1);
TEST (11, false, 2);
TEST (12, false, 2);
TEST (13, false, 2);
TEST (14, false, 2);
TEST (15, false, 2);
TEST (16, true , 2);
TEST (17, true , 2);
TEST (18, true , 2);
TEST (19, true , 2);
TEST (20, false, 3);
return 0;
}