Speed up 9x7 IDWT by ~20%

"bench_dwt -I" time goes from 2.8s to 2.2s
This commit is contained in:
Even Rouault 2020-05-21 11:23:00 +02:00
parent 0c09062464
commit 47943daa15
No known key found for this signature in database
GPG Key ID: 33EBBFC47B3DD87D
1 changed file with 176 additions and 206 deletions

View File

@ -87,12 +87,14 @@ typedef struct dwt_local {
OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */
} opj_dwt_t; } opj_dwt_t;
typedef union { #define NB_ELTS_V8 8
OPJ_FLOAT32 f[4];
} opj_v4_t;
typedef struct v4dwt_local { typedef union {
opj_v4_t* wavelet ; OPJ_FLOAT32 f[NB_ELTS_V8];
} opj_v8_t;
typedef struct v8dwt_local {
opj_v8_t* wavelet ;
OPJ_INT32 dn ; /* number of elements in high pass band */ OPJ_INT32 dn ; /* number of elements in high pass band */
OPJ_INT32 sn ; /* number of elements in low pass band */ OPJ_INT32 sn ; /* number of elements in low pass band */
OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */
@ -100,7 +102,7 @@ typedef struct v4dwt_local {
OPJ_UINT32 win_l_x1; /* end coord in low pass band */ OPJ_UINT32 win_l_x1; /* end coord in low pass band */
OPJ_UINT32 win_h_x0; /* start coord in high pass band */ OPJ_UINT32 win_h_x0; /* start coord in high pass band */
OPJ_UINT32 win_h_x1; /* end coord in high pass band */ OPJ_UINT32 win_h_x1; /* end coord in high pass band */
} opj_v4dwt_t ; } opj_v8dwt_t ;
/* From table F.4 from the standard */ /* From table F.4 from the standard */
static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f;
@ -170,42 +172,6 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
/* <summary> */ /* <summary> */
/* Inverse 9-7 wavelet transform in 1-D. */ /* Inverse 9-7 wavelet transform in 1-D. */
/* </summary> */ /* </summary> */
static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width,
OPJ_UINT32 remaining_height);
static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width,
OPJ_UINT32 nb_elts_read);
#ifdef __SSE__
static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
const __m128 c);
static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
OPJ_UINT32 m, __m128 c);
#else
static void opj_v4dwt_decode_step1(opj_v4_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
const OPJ_FLOAT32 c);
static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
OPJ_UINT32 m,
OPJ_FLOAT32 c);
#endif
/*@}*/ /*@}*/
@ -2332,7 +2298,7 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
return OPJ_TRUE; return OPJ_TRUE;
} }
static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width, OPJ_UINT32 width,
OPJ_UINT32 remaining_height) OPJ_UINT32 remaining_height)
@ -2343,39 +2309,69 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
OPJ_UINT32 x1 = dwt->win_l_x1; OPJ_UINT32 x1 = dwt->win_l_x1;
for (k = 0; k < 2; ++k) { for (k = 0; k < 2; ++k) {
if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { ((OPJ_SIZE_T) bi & 0x0f) == 0) {
/* Fast code path */ /* Fast code path */
for (i = x0; i < x1; ++i) { for (i = x0; i < x1; ++i) {
OPJ_UINT32 j = i; OPJ_UINT32 j = i;
bi[i * 8 ] = a[j]; OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
dst[0] = a[j];
j += width; j += width;
bi[i * 8 + 1] = a[j]; dst[1] = a[j];
j += width; j += width;
bi[i * 8 + 2] = a[j]; dst[2] = a[j];
j += width; j += width;
bi[i * 8 + 3] = a[j]; dst[3] = a[j];
j += width;
dst[4] = a[j];
j += width;
dst[5] = a[j];
j += width;
dst[6] = a[j];
j += width;
dst[7] = a[j];
} }
} else { } else {
/* Slow code path */ /* Slow code path */
for (i = x0; i < x1; ++i) { for (i = x0; i < x1; ++i) {
OPJ_UINT32 j = i; OPJ_UINT32 j = i;
bi[i * 8 ] = a[j]; OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
dst[0] = a[j];
j += width; j += width;
if (remaining_height == 1) { if (remaining_height == 1) {
continue; continue;
} }
bi[i * 8 + 1] = a[j]; dst[1] = a[j];
j += width; j += width;
if (remaining_height == 2) { if (remaining_height == 2) {
continue; continue;
} }
bi[i * 8 + 2] = a[j]; dst[2] = a[j];
j += width; j += width;
if (remaining_height == 3) { if (remaining_height == 3) {
continue; continue;
} }
bi[i * 8 + 3] = a[j]; /* This one*/ dst[3] = a[j];
j += width;
if (remaining_height == 4) {
continue;
}
dst[4] = a[j];
j += width;
if (remaining_height == 5) {
continue;
}
dst[5] = a[j];
j += width;
if (remaining_height == 6) {
continue;
}
dst[6] = a[j];
j += width;
if (remaining_height == 7) {
continue;
}
dst[7] = a[j];
} }
} }
@ -2386,7 +2382,7 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
} }
} }
static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
opj_sparse_array_int32_t* sa, opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_line, OPJ_UINT32 sa_line,
OPJ_UINT32 remaining_height) OPJ_UINT32 remaining_height)
@ -2399,25 +2395,25 @@ static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt,
dwt->win_l_x1, sa_line + i + 1, dwt->win_l_x1, sa_line + i + 1,
/* Nasty cast from float* to int32* */ /* Nasty cast from float* to int32* */
(OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
8, 0, OPJ_TRUE); 2 * NB_ELTS_V8, 0, OPJ_TRUE);
assert(ret); assert(ret);
ret = opj_sparse_array_int32_read(sa, ret = opj_sparse_array_int32_read(sa,
(OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
(OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
/* Nasty cast from float* to int32* */ /* Nasty cast from float* to int32* */
(OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
8, 0, OPJ_TRUE); 2 * NB_ELTS_V8, 0, OPJ_TRUE);
assert(ret); assert(ret);
OPJ_UNUSED(ret); OPJ_UNUSED(ret);
} }
} }
static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width, OPJ_UINT32 width,
OPJ_UINT32 nb_elts_read) OPJ_UINT32 nb_elts_read)
{ {
opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
OPJ_UINT32 i; OPJ_UINT32 i;
for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
@ -2434,7 +2430,7 @@ static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
} }
} }
static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
opj_sparse_array_int32_t* sa, opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_col, OPJ_UINT32 sa_col,
OPJ_UINT32 nb_elts_read) OPJ_UINT32 nb_elts_read)
@ -2444,44 +2440,36 @@ static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
sa_col, dwt->win_l_x0, sa_col, dwt->win_l_x0,
sa_col + nb_elts_read, dwt->win_l_x1, sa_col + nb_elts_read, dwt->win_l_x1,
(OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
1, 8, OPJ_TRUE); 1, 2 * NB_ELTS_V8, OPJ_TRUE);
assert(ret); assert(ret);
ret = opj_sparse_array_int32_read(sa, ret = opj_sparse_array_int32_read(sa,
sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
(OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
1, 8, OPJ_TRUE); 1, 2 * NB_ELTS_V8, OPJ_TRUE);
assert(ret); assert(ret);
OPJ_UNUSED(ret); OPJ_UNUSED(ret);
} }
#ifdef __SSE__ #ifdef __SSE__
static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
OPJ_UINT32 start, OPJ_UINT32 start,
OPJ_UINT32 end, OPJ_UINT32 end,
const __m128 c) const __m128 c)
{ {
__m128* OPJ_RESTRICT vw = (__m128*) w; __m128* OPJ_RESTRICT vw = (__m128*) w;
OPJ_UINT32 i; OPJ_UINT32 i = start;
/* 4x unrolled loop */ /* To be adapted if NB_ELTS_V8 changes */
vw += 2 * start; vw += 4 * start;
for (i = start; i + 3 < end; i += 4, vw += 8) { /* Note: attempt at loop unrolling x2 doesn't help */
__m128 xmm0 = _mm_mul_ps(vw[0], c); for (; i < end; ++i, vw += 4) {
__m128 xmm2 = _mm_mul_ps(vw[2], c);
__m128 xmm4 = _mm_mul_ps(vw[4], c);
__m128 xmm6 = _mm_mul_ps(vw[6], c);
vw[0] = xmm0;
vw[2] = xmm2;
vw[4] = xmm4;
vw[6] = xmm6;
}
for (; i < end; ++i, vw += 2) {
vw[0] = _mm_mul_ps(vw[0], c); vw[0] = _mm_mul_ps(vw[0], c);
vw[1] = _mm_mul_ps(vw[1], c);
} }
} }
static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
OPJ_UINT32 start, OPJ_UINT32 start,
OPJ_UINT32 end, OPJ_UINT32 end,
OPJ_UINT32 m, OPJ_UINT32 m,
@ -2489,74 +2477,58 @@ static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
{ {
__m128* OPJ_RESTRICT vl = (__m128*) l; __m128* OPJ_RESTRICT vl = (__m128*) l;
__m128* OPJ_RESTRICT vw = (__m128*) w; __m128* OPJ_RESTRICT vw = (__m128*) w;
/* To be adapted if NB_ELTS_V8 changes */
OPJ_UINT32 i; OPJ_UINT32 i;
OPJ_UINT32 imax = opj_uint_min(end, m); OPJ_UINT32 imax = opj_uint_min(end, m);
__m128 tmp1, tmp2, tmp3;
if (start == 0) { if (start == 0) {
tmp1 = vl[0]; if (imax >= 1) {
vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
vw += 4;
start = 1;
}
} else { } else {
vw += start * 2; vw += start * 4;
tmp1 = vw[-3];
} }
i = start; i = start;
/* Note: attempt at loop unrolling x2 doesn't help */
/* 4x loop unrolling */
for (; i + 3 < imax; i += 4) {
__m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
tmp2 = vw[-1];
tmp3 = vw[ 0];
tmp4 = vw[ 1];
tmp5 = vw[ 2];
tmp6 = vw[ 3];
tmp7 = vw[ 4];
tmp8 = vw[ 5];
tmp9 = vw[ 6];
vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
tmp1 = tmp9;
vw += 8;
}
for (; i < imax; ++i) { for (; i < imax; ++i) {
tmp2 = vw[-1]; vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
tmp3 = vw[ 0]; vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); vw += 4;
tmp1 = tmp3;
vw += 2;
} }
if (m < end) { if (m < end) {
assert(m + 1 == end); assert(m + 1 == end);
c = _mm_add_ps(c, c); c = _mm_add_ps(c, c);
c = _mm_mul_ps(c, vw[-2]); vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
vw[-1] = _mm_add_ps(vw[-1], c); vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
} }
} }
#else #else
static void opj_v4dwt_decode_step1(opj_v4_t* w, static void opj_v8dwt_decode_step1(opj_v8_t* w,
OPJ_UINT32 start, OPJ_UINT32 start,
OPJ_UINT32 end, OPJ_UINT32 end,
const OPJ_FLOAT32 c) const OPJ_FLOAT32 c)
{ {
OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
OPJ_UINT32 i; OPJ_UINT32 i;
/* To be adapted if NB_ELTS_V8 changes */
for (i = start; i < end; ++i) { for (i = start; i < end; ++i) {
OPJ_FLOAT32 tmp1 = fw[i * 8 ]; fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c;
OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
fw[i * 8 ] = tmp1 * c; fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
fw[i * 8 + 1] = tmp2 * c; fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
fw[i * 8 + 2] = tmp3 * c; fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
fw[i * 8 + 3] = tmp4 * c; fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
} }
} }
static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
OPJ_UINT32 start, OPJ_UINT32 start,
OPJ_UINT32 end, OPJ_UINT32 end,
OPJ_UINT32 m, OPJ_UINT32 m,
@ -2567,36 +2539,33 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
OPJ_UINT32 i; OPJ_UINT32 i;
OPJ_UINT32 imax = opj_uint_min(end, m); OPJ_UINT32 imax = opj_uint_min(end, m);
if (start > 0) { if (start > 0) {
fw += 8 * start; fw += 2 * NB_ELTS_V8 * start;
fl = fw - 8; fl = fw - 2 * NB_ELTS_V8;
} }
/* To be adapted if NB_ELTS_V8 changes */
for (i = start; i < imax; ++i) { for (i = start; i < imax; ++i) {
OPJ_FLOAT32 tmp1_1 = fl[0]; fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
OPJ_FLOAT32 tmp1_2 = fl[1]; fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
OPJ_FLOAT32 tmp1_3 = fl[2]; fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
OPJ_FLOAT32 tmp1_4 = fl[3]; fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
OPJ_FLOAT32 tmp2_1 = fw[-4]; fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
OPJ_FLOAT32 tmp2_2 = fw[-3]; fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
OPJ_FLOAT32 tmp2_3 = fw[-2]; fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
OPJ_FLOAT32 tmp2_4 = fw[-1]; fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
OPJ_FLOAT32 tmp3_1 = fw[0];
OPJ_FLOAT32 tmp3_2 = fw[1];
OPJ_FLOAT32 tmp3_3 = fw[2];
OPJ_FLOAT32 tmp3_4 = fw[3];
fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
fl = fw; fl = fw;
fw += 8; fw += 2 * NB_ELTS_V8;
} }
if (m < end) { if (m < end) {
assert(m + 1 == end); assert(m + 1 == end);
c += c; c += c;
fw[-4] = fw[-4] + fl[0] * c; fw[-8] = fw[-8] + fl[0] * c;
fw[-3] = fw[-3] + fl[1] * c; fw[-7] = fw[-7] + fl[1] * c;
fw[-2] = fw[-2] + fl[2] * c; fw[-6] = fw[-6] + fl[2] * c;
fw[-1] = fw[-1] + fl[3] * c; fw[-5] = fw[-5] + fl[3] * c;
fw[-4] = fw[-4] + fl[4] * c;
fw[-3] = fw[-3] + fl[5] * c;
fw[-2] = fw[-2] + fl[6] * c;
fw[-1] = fw[-1] + fl[7] * c;
} }
} }
@ -2605,7 +2574,7 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
/* <summary> */ /* <summary> */
/* Inverse 9-7 wavelet transform in 1-D. */ /* Inverse 9-7 wavelet transform in 1-D. */
/* </summary> */ /* </summary> */
static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
{ {
OPJ_INT32 a, b; OPJ_INT32 a, b;
/* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
@ -2630,44 +2599,44 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
b = 0; b = 0;
} }
#ifdef __SSE__ #ifdef __SSE__
opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
_mm_set1_ps(opj_K)); _mm_set1_ps(opj_K));
opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
_mm_set1_ps(two_invK)); _mm_set1_ps(two_invK));
opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1, dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
_mm_set1_ps(-opj_dwt_delta)); _mm_set1_ps(-opj_dwt_delta));
opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1, dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
_mm_set1_ps(-opj_dwt_gamma)); _mm_set1_ps(-opj_dwt_gamma));
opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1, dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
_mm_set1_ps(-opj_dwt_beta)); _mm_set1_ps(-opj_dwt_beta));
opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1, dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
_mm_set1_ps(-opj_dwt_alpha)); _mm_set1_ps(-opj_dwt_alpha));
#else #else
opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
opj_K); opj_K);
opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
two_invK); two_invK);
opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1, dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
-opj_dwt_delta); -opj_dwt_delta);
opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1, dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
-opj_dwt_gamma); -opj_dwt_gamma);
opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1, dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
-opj_dwt_beta); -opj_dwt_beta);
opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1, dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
-opj_dwt_alpha); -opj_dwt_alpha);
@ -2682,8 +2651,8 @@ static
OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
OPJ_UINT32 numres) OPJ_UINT32 numres)
{ {
opj_v4dwt_t h; opj_v8dwt_t h;
opj_v4dwt_t v; opj_v8dwt_t v;
opj_tcd_resolution_t* res = tilec->resolutions; opj_tcd_resolution_t* res = tilec->resolutions;
@ -2706,11 +2675,11 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
} }
l_data_size += 5U; l_data_size += 5U;
/* overflow check */ /* overflow check */
if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
return OPJ_FALSE; return OPJ_FALSE;
} }
h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
if (!h.wavelet) { if (!h.wavelet) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
return OPJ_FALSE; return OPJ_FALSE;
@ -2738,35 +2707,36 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
h.win_l_x1 = (OPJ_UINT32)h.sn; h.win_l_x1 = (OPJ_UINT32)h.sn;
h.win_h_x0 = 0; h.win_h_x0 = 0;
h.win_h_x1 = (OPJ_UINT32)h.dn; h.win_h_x1 = (OPJ_UINT32)h.dn;
for (j = 0; j + 3 < rh; j += 4) { for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
OPJ_UINT32 k; OPJ_UINT32 k;
opj_v4dwt_interleave_h(&h, aj, w, rh - j); opj_v8dwt_interleave_h(&h, aj, w, rh - j);
opj_v4dwt_decode(&h); opj_v8dwt_decode(&h);
/* To be adapted if NB_ELTS_V8 changes */
for (k = 0; k < rw; k++) { for (k = 0; k < rw; k++) {
aj[k ] = h.wavelet[k].f[0]; aj[k ] = h.wavelet[k].f[0];
aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
} }
for (k = 0; k < rw; k++) {
aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
}
aj += w * 4; aj += w * NB_ELTS_V8;
} }
if (j < rh) { if (j < rh) {
OPJ_UINT32 k; OPJ_UINT32 k;
opj_v4dwt_interleave_h(&h, aj, w, rh - j); opj_v8dwt_interleave_h(&h, aj, w, rh - j);
opj_v4dwt_decode(&h); opj_v8dwt_decode(&h);
for (k = 0; k < rw; k++) { for (k = 0; k < rw; k++) {
switch (rh - j) { OPJ_UINT32 l;
case 3: for (l = 0; l < rh - j; l++) {
aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l];
/* FALLTHRU */
case 2:
aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
/* FALLTHRU */
case 1:
aj[k] = h.wavelet[k].f[0];
} }
} }
} }
@ -2779,25 +2749,25 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
v.win_h_x1 = (OPJ_UINT32)v.dn; v.win_h_x1 = (OPJ_UINT32)v.dn;
aj = (OPJ_FLOAT32*) tilec->data; aj = (OPJ_FLOAT32*) tilec->data;
for (j = rw; j > 3; j -= 4) { for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
OPJ_UINT32 k; OPJ_UINT32 k;
opj_v4dwt_interleave_v(&v, aj, w, 4); opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
opj_v4dwt_decode(&v); opj_v8dwt_decode(&v);
for (k = 0; k < rh; ++k) { for (k = 0; k < rh; ++k) {
memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
} }
aj += 4; aj += NB_ELTS_V8;
} }
if (rw & 0x03) { if (rw & (NB_ELTS_V8 - 1)) {
OPJ_UINT32 k; OPJ_UINT32 k;
j = rw & 0x03; j = rw & (NB_ELTS_V8 - 1);
opj_v4dwt_interleave_v(&v, aj, w, j); opj_v8dwt_interleave_v(&v, aj, w, j);
opj_v4dwt_decode(&v); opj_v8dwt_decode(&v);
for (k = 0; k < rh; ++k) { for (k = 0; k < rh; ++k) {
memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
@ -2815,8 +2785,8 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
OPJ_UINT32 numres) OPJ_UINT32 numres)
{ {
opj_sparse_array_int32_t* sa; opj_sparse_array_int32_t* sa;
opj_v4dwt_t h; opj_v8dwt_t h;
opj_v4dwt_t v; opj_v8dwt_t v;
OPJ_UINT32 resno; OPJ_UINT32 resno;
/* This value matches the maximum left/right extension given in tables */ /* This value matches the maximum left/right extension given in tables */
/* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
@ -2873,12 +2843,12 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
} }
l_data_size += 5U; l_data_size += 5U;
/* overflow check */ /* overflow check */
if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
opj_sparse_array_int32_free(sa); opj_sparse_array_int32_free(sa);
return OPJ_FALSE; return OPJ_FALSE;
} }
h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
if (!h.wavelet) { if (!h.wavelet) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
opj_sparse_array_int32_free(sa); opj_sparse_array_int32_free(sa);
@ -2973,17 +2943,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
h.win_l_x1 = win_ll_x1; h.win_l_x1 = win_ll_x1;
h.win_h_x0 = win_hl_x0; h.win_h_x0 = win_hl_x0;
h.win_h_x1 = win_hl_x1; h.win_h_x1 = win_hl_x1;
for (j = 0; j + 3 < rh; j += 4) { for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
(j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
j < win_lh_y1 + (OPJ_UINT32)v.sn)) { j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
opj_v4dwt_decode(&h); opj_v8dwt_decode(&h);
if (!opj_sparse_array_int32_write(sa, if (!opj_sparse_array_int32_write(sa,
win_tr_x0, j, win_tr_x0, j,
win_tr_x1, j + 4, win_tr_x1, j + NB_ELTS_V8,
(OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
4, 1, OPJ_TRUE)) { NB_ELTS_V8, 1, OPJ_TRUE)) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
opj_sparse_array_int32_free(sa); opj_sparse_array_int32_free(sa);
opj_aligned_free(h.wavelet); opj_aligned_free(h.wavelet);
@ -2993,16 +2963,16 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
} }
if (j < rh && if (j < rh &&
((j + 3 >= win_ll_y0 && j < win_ll_y1) || ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
(j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
j < win_lh_y1 + (OPJ_UINT32)v.sn))) { j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
opj_v4dwt_decode(&h); opj_v8dwt_decode(&h);
if (!opj_sparse_array_int32_write(sa, if (!opj_sparse_array_int32_write(sa,
win_tr_x0, j, win_tr_x0, j,
win_tr_x1, rh, win_tr_x1, rh,
(OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
4, 1, OPJ_TRUE)) { NB_ELTS_V8, 1, OPJ_TRUE)) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
opj_sparse_array_int32_free(sa); opj_sparse_array_int32_free(sa);
opj_aligned_free(h.wavelet); opj_aligned_free(h.wavelet);
@ -3014,17 +2984,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
v.win_l_x1 = win_ll_y1; v.win_l_x1 = win_ll_y1;
v.win_h_x0 = win_lh_y0; v.win_h_x0 = win_lh_y0;
v.win_h_x1 = win_lh_y1; v.win_h_x1 = win_lh_y1;
for (j = win_tr_x0; j < win_tr_x1; j += 4) { for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
opj_v4dwt_decode(&v); opj_v8dwt_decode(&v);
if (!opj_sparse_array_int32_write(sa, if (!opj_sparse_array_int32_write(sa,
j, win_tr_y0, j, win_tr_y0,
j + nb_elts, win_tr_y1, j + nb_elts, win_tr_y1,
(OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
1, 4, OPJ_TRUE)) { 1, NB_ELTS_V8, OPJ_TRUE)) {
/* FIXME event manager error callback */ /* FIXME event manager error callback */
opj_sparse_array_int32_free(sa); opj_sparse_array_int32_free(sa);
opj_aligned_free(h.wavelet); opj_aligned_free(h.wavelet);