From 47943daa15983d20ab1ffd0a237125f1493c5658 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 May 2020 11:23:00 +0200 Subject: [PATCH] Speed-up 9x7 IDWD by ~20% "bench_dwt -I" time goes from 2.8s to 2.2s --- src/lib/openjp2/dwt.c | 382 +++++++++++++++++++----------------------- 1 file changed, 176 insertions(+), 206 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 5710e802..9fef2234 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -87,12 +87,14 @@ typedef struct dwt_local { OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ } opj_dwt_t; -typedef union { - OPJ_FLOAT32 f[4]; -} opj_v4_t; +#define NB_ELTS_V8 8 -typedef struct v4dwt_local { - opj_v4_t* wavelet ; +typedef union { + OPJ_FLOAT32 f[NB_ELTS_V8]; +} opj_v8_t; + +typedef struct v8dwt_local { + opj_v8_t* wavelet ; OPJ_INT32 dn ; /* number of elements in high pass band */ OPJ_INT32 sn ; /* number of elements in low pass band */ OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ @@ -100,7 +102,7 @@ typedef struct v4dwt_local { OPJ_UINT32 win_l_x1; /* end coord in low pass band */ OPJ_UINT32 win_h_x0; /* start coord in high pass band */ OPJ_UINT32 win_h_x1; /* end coord in high pass band */ -} opj_v4dwt_t ; +} opj_v8dwt_t ; /* From table F.4 from the standard */ static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; @@ -170,42 +172,6 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, /* */ /* Inverse 9-7 wavelet transform in 1-D. */ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); - -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 remaining_height); - -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read); - -#ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const __m128 c); - -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, __m128 c); - -#else -static void opj_v4dwt_decode_step1(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const OPJ_FLOAT32 c); - -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - OPJ_FLOAT32 c); - -#endif /*@}*/ @@ -2332,7 +2298,7 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( return OPJ_TRUE; } -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt, OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_UINT32 width, OPJ_UINT32 remaining_height) @@ -2343,39 +2309,69 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, OPJ_UINT32 x1 = dwt->win_l_x1; for (k = 0; k < 2; ++k) { - if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && - ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { + if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 && + ((OPJ_SIZE_T) bi & 0x0f) == 0) { /* Fast code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; - bi[i * 8 + 1] = a[j]; + dst[1] = a[j]; j += width; - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; - bi[i * 8 + 3] = a[j]; + dst[3] = a[j]; + j += width; + dst[4] = a[j]; + j += width; + dst[5] = a[j]; + j += width; + dst[6] = a[j]; + j += width; + dst[7] = a[j]; } } else { /* Slow code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; if (remaining_height == 1) { continue; } - bi[i * 8 + 1] = a[j]; + dst[1] = a[j]; j += width; if (remaining_height == 2) { continue; } - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; if (remaining_height == 3) { continue; } - bi[i * 8 + 3] = a[j]; /* This one*/ + dst[3] = a[j]; + j += width; + if (remaining_height == 4) { + continue; + } + dst[4] = a[j]; + j += width; + if (remaining_height == 5) { + continue; + } + dst[5] = a[j]; + j += width; + if (remaining_height == 6) { + continue; + } + dst[6] = a[j]; + j += width; + if (remaining_height == 7) { + continue; + } + dst[7] = a[j]; } } @@ -2386,7 +2382,7 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, +static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_line, OPJ_UINT32 remaining_height) @@ -2399,25 +2395,25 @@ static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, dwt->win_l_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } } -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read) +static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt, + OPJ_FLOAT32* OPJ_RESTRICT a, + OPJ_UINT32 width, + OPJ_UINT32 nb_elts_read) { - opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; + opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; OPJ_UINT32 i; for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { @@ -2434,7 +2430,7 @@ static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_col, OPJ_UINT32 nb_elts_read) @@ -2444,44 +2440,36 @@ static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, sa_col, dwt->win_l_x0, sa_col + nb_elts_read, dwt->win_l_x1, (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } #ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, +static void opj_v8dwt_decode_step1_sse(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const __m128 c) { __m128* OPJ_RESTRICT vw = (__m128*) w; - OPJ_UINT32 i; - /* 4x unrolled loop */ - vw += 2 * start; - for (i = start; i + 3 < end; i += 4, vw += 8) { - __m128 xmm0 = _mm_mul_ps(vw[0], c); - __m128 xmm2 = _mm_mul_ps(vw[2], c); - __m128 xmm4 = _mm_mul_ps(vw[4], c); - __m128 xmm6 = _mm_mul_ps(vw[6], c); - vw[0] = xmm0; - vw[2] = xmm2; - vw[4] = xmm4; - vw[6] = xmm6; - } - for (; i < end; ++i, vw += 2) { + OPJ_UINT32 i = start; + /* To be adapted if NB_ELTS_V8 changes */ + vw += 4 * start; + /* Note: attempt at loop unrolling x2 doesn't help */ + for (; i < end; ++i, vw += 4) { vw[0] = _mm_mul_ps(vw[0], c); + vw[1] = _mm_mul_ps(vw[1], c); } } -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, +static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, OPJ_UINT32 m, @@ -2489,74 +2477,58 @@ static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, { __m128* OPJ_RESTRICT vl = (__m128*) l; __m128* OPJ_RESTRICT vw = (__m128*) w; + /* To be adapted if NB_ELTS_V8 changes */ OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); - __m128 tmp1, tmp2, tmp3; if (start == 0) { - tmp1 = vl[0]; + if (imax >= 1) { + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c)); + vw += 4; + start = 1; + } } else { - vw += start * 2; - tmp1 = vw[-3]; + vw += start * 4; } i = start; - - /* 4x loop unrolling */ - for (; i + 3 < imax; i += 4) { - __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - tmp4 = vw[ 1]; - tmp5 = vw[ 2]; - tmp6 = vw[ 3]; - tmp7 = vw[ 4]; - tmp8 = vw[ 5]; - tmp9 = vw[ 6]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); - vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); - vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); - tmp1 = tmp9; - vw += 8; - } - + /* Note: attempt at loop unrolling x2 doesn't help */ for (; i < imax; ++i) { - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - tmp1 = tmp3; - vw += 2; + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c)); + vw += 4; } if (m < end) { assert(m + 1 == end); c = _mm_add_ps(c, c); - c = _mm_mul_ps(c, vw[-2]); - vw[-1] = _mm_add_ps(vw[-1], c); + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4])); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3])); } } #else -static void opj_v4dwt_decode_step1(opj_v4_t* w, +static void opj_v8dwt_decode_step1(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const OPJ_FLOAT32 c) { OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; OPJ_UINT32 i; + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < end; ++i) { - OPJ_FLOAT32 tmp1 = fw[i * 8 ]; - OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; - OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; - OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; - fw[i * 8 ] = tmp1 * c; - fw[i * 8 + 1] = tmp2 * c; - fw[i * 8 + 2] = tmp3 * c; - fw[i * 8 + 3] = tmp4 * c; + fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c; + fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c; + fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c; + fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c; + fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c; + fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c; + fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c; + fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c; } } -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, +static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, OPJ_UINT32 m, @@ -2567,36 +2539,33 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); if (start > 0) { - fw += 8 * start; - fl = fw - 8; + fw += 2 * NB_ELTS_V8 * start; + fl = fw - 2 * NB_ELTS_V8; } + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < imax; ++i) { - OPJ_FLOAT32 tmp1_1 = fl[0]; - OPJ_FLOAT32 tmp1_2 = fl[1]; - OPJ_FLOAT32 tmp1_3 = fl[2]; - OPJ_FLOAT32 tmp1_4 = fl[3]; - OPJ_FLOAT32 tmp2_1 = fw[-4]; - OPJ_FLOAT32 tmp2_2 = fw[-3]; - OPJ_FLOAT32 tmp2_3 = fw[-2]; - OPJ_FLOAT32 tmp2_4 = fw[-1]; - OPJ_FLOAT32 tmp3_1 = fw[0]; - OPJ_FLOAT32 tmp3_2 = fw[1]; - OPJ_FLOAT32 tmp3_3 = fw[2]; - OPJ_FLOAT32 tmp3_4 = fw[3]; - fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); - fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); - fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); - fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); + fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c); + fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c); + fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c); + fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c); + fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c); + fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c); + fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c); + fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c); fl = fw; - fw += 8; + fw += 2 * NB_ELTS_V8; } if (m < end) { assert(m + 1 == end); c += c; - fw[-4] = fw[-4] + fl[0] * c; - fw[-3] = fw[-3] + fl[1] * c; - fw[-2] = fw[-2] + fl[2] * c; - fw[-1] = fw[-1] + fl[3] * c; + fw[-8] = fw[-8] + fl[0] * c; + fw[-7] = fw[-7] + fl[1] * c; + fw[-6] = fw[-6] + fl[2] * c; + fw[-5] = fw[-5] + fl[3] * c; + fw[-4] = fw[-4] + fl[4] * c; + fw[-3] = fw[-3] + fl[5] * c; + fw[-2] = fw[-2] + fl[6] * c; + fw[-1] = fw[-1] + fl[7] * c; } } @@ -2605,7 +2574,7 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, /* */ /* Inverse 9-7 wavelet transform in 1-D. */ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) +static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) { OPJ_INT32 a, b; /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ @@ -2630,44 +2599,44 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) b = 0; } #ifdef __SSE__ - opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, _mm_set1_ps(opj_K)); - opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, _mm_set1_ps(two_invK)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(-opj_dwt_delta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(-opj_dwt_gamma)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(-opj_dwt_beta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(-opj_dwt_alpha)); #else - opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); - opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, two_invK); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), -opj_dwt_delta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), -opj_dwt_gamma); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), -opj_dwt_beta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), -opj_dwt_alpha); @@ -2682,8 +2651,8 @@ static OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; opj_tcd_resolution_t* res = tilec->resolutions; @@ -2706,11 +2675,11 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } l_data_size += 5U; /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ return OPJ_FALSE; @@ -2738,35 +2707,36 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = (OPJ_UINT32)h.sn; h.win_h_x0 = 0; h.win_h_x1 = (OPJ_UINT32)h.dn; - for (j = 0; j + 3 < rh; j += 4) { + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_h(&h, aj, w, rh - j); + opj_v8dwt_decode(&h); + /* To be adapted if NB_ELTS_V8 changes */ for (k = 0; k < rw; k++) { aj[k ] = h.wavelet[k].f[0]; aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; } + for (k = 0; k < rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7]; + } - aj += w * 4; + aj += w * NB_ELTS_V8; } if (j < rh) { OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_h(&h, aj, w, rh - j); + opj_v8dwt_decode(&h); for (k = 0; k < rw; k++) { - switch (rh - j) { - case 3: - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - /* FALLTHRU */ - case 2: - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - /* FALLTHRU */ - case 1: - aj[k] = h.wavelet[k].f[0]; + OPJ_UINT32 l; + for (l = 0; l < rh - j; l++) { + aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l]; } } } @@ -2779,25 +2749,25 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_h_x1 = (OPJ_UINT32)v.dn; aj = (OPJ_FLOAT32*) tilec->data; - for (j = rw; j > 3; j -= 4) { + for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { OPJ_UINT32 k; - opj_v4dwt_interleave_v(&v, aj, w, 4); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&v); for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); + memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); } - aj += 4; + aj += NB_ELTS_V8; } - if (rw & 0x03) { + if (rw & (NB_ELTS_V8 - 1)) { OPJ_UINT32 k; - j = rw & 0x03; + j = rw & (NB_ELTS_V8 - 1); - opj_v4dwt_interleave_v(&v, aj, w, j); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, j); + opj_v8dwt_decode(&v); for (k = 0; k < rh; ++k) { memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], @@ -2815,8 +2785,8 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { opj_sparse_array_int32_t* sa; - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; OPJ_UINT32 resno; /* This value matches the maximum left/right extension given in tables */ /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ @@ -2873,12 +2843,12 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } l_data_size += 5U; /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); return OPJ_FALSE; } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); @@ -2973,17 +2943,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = win_ll_x1; h.win_h_x0 = win_hl_x0; h.win_h_x1 = win_hl_x1; - for (j = 0; j + 3 < rh; j += 4) { - if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { + if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { - opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j)); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, - win_tr_x1, j + 4, + win_tr_x1, j + NB_ELTS_V8, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -2993,16 +2963,16 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } if (j < rh && - ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn))) { - opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, win_tr_x1, rh, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -3014,17 +2984,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_l_x1 = win_ll_y1; v.win_h_x0 = win_lh_y0; v.win_h_x1 = win_lh_y1; - for (j = win_tr_x0; j < win_tr_x1; j += 4) { - OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); + for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) { + OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j); - opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts); + opj_v8dwt_decode(&v); if (!opj_sparse_array_int32_write(sa, j, win_tr_y0, j + nb_elts, win_tr_y1, (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], - 1, 4, OPJ_TRUE)) { + 1, NB_ELTS_V8, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet);