diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c
index 4b00c83a..bdc91cf5 100644
--- a/src/lib/openjp2/dwt.c
+++ b/src/lib/openjp2/dwt.c
@@ -134,12 +134,12 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
/**
Forward 5-3 wavelet transform in 1-D
*/
-static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
+static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas);
/**
Forward 9-7 wavelet transform in 1-D
*/
-static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
+static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas);
/**
Explicit calculation of the Quantization Stepsizes
@@ -156,9 +156,13 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres);
+/* The void* argument is an OPJ_INT32* for 5-3 and an OPJ_FLOAT32* for 9-7 */
+typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32,
+ OPJ_INT32);
+
static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
- void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
+ opj_encode_one_row_fnptr_type p_function);
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
OPJ_UINT32 i);
@@ -346,10 +350,11 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
/* */
/* Forward 5-3 wavelet transform in 1-D. */
/* */
-static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
+static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas)
{
OPJ_INT32 i;
+ OPJ_INT32* a = (OPJ_INT32*)aIn;
if (!cas) {
if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
@@ -1039,50 +1044,52 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
/* */
/* Forward 9-7 wavelet transform in 1-D. */
/* */
-static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
+static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas)
{
OPJ_INT32 i;
+ OPJ_FLOAT32* a = (OPJ_FLOAT32*)aIn;
+
if (!cas) {
if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
for (i = 0; i < dn; i++) {
- OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
+ OPJ_D(i) += opj_dwt_alpha * (OPJ_S_(i) + OPJ_S_(i + 1));
}
for (i = 0; i < sn; i++) {
- OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
+ OPJ_S(i) += opj_dwt_beta * (OPJ_D_(i - 1) + OPJ_D_(i));
}
for (i = 0; i < dn; i++) {
- OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
+ OPJ_D(i) += opj_dwt_gamma * (OPJ_S_(i) + OPJ_S_(i + 1));
}
for (i = 0; i < sn; i++) {
- OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
+ OPJ_S(i) += opj_dwt_delta * (OPJ_D_(i - 1) + OPJ_D_(i));
}
for (i = 0; i < dn; i++) {
- OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */
+ OPJ_D(i) = opj_K / 2 * OPJ_D(i);
}
for (i = 0; i < sn; i++) {
- OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */
+ OPJ_S(i) = opj_c13318 / 2 * OPJ_S(i);
}
}
} else {
if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
for (i = 0; i < dn; i++) {
- OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
+ OPJ_S(i) += opj_dwt_alpha * (OPJ_DD_(i) + OPJ_DD_(i - 1));
}
for (i = 0; i < sn; i++) {
- OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
+ OPJ_D(i) += opj_dwt_beta * (OPJ_SS_(i) + OPJ_SS_(i + 1));
}
for (i = 0; i < dn; i++) {
- OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
+ OPJ_S(i) += opj_dwt_gamma * (OPJ_DD_(i) + OPJ_DD_(i - 1));
}
for (i = 0; i < sn; i++) {
- OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
+ OPJ_D(i) += opj_dwt_delta * (OPJ_SS_(i) + OPJ_SS_(i + 1));
}
for (i = 0; i < dn; i++) {
- OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */
+ OPJ_S(i) = opj_K / 2 * OPJ_S(i);
}
for (i = 0; i < sn; i++) {
- OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */
+ OPJ_D(i) = opj_c13318 / 2 * OPJ_D(i);
}
}
}
@@ -1112,7 +1119,7 @@ typedef struct {
OPJ_INT32 * OPJ_RESTRICT tiledp;
OPJ_UINT32 min_j;
OPJ_UINT32 max_j;
- void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
+ opj_encode_one_row_fnptr_type p_function;
} opj_dwt_encode_h_job_t;
static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
@@ -1143,7 +1150,7 @@ typedef struct {
OPJ_INT32 * OPJ_RESTRICT tiledp;
OPJ_UINT32 min_j;
OPJ_UINT32 max_j;
- void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
+ opj_encode_one_row_fnptr_type p_function;
} opj_dwt_encode_v_job_t;
static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
@@ -1175,7 +1182,7 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
/* */
static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
- void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
+ opj_encode_one_row_fnptr_type p_function)
{
OPJ_INT32 i;
OPJ_INT32 *bj = 00;
diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c
index 08bc8115..9d79b50a 100644
--- a/src/lib/openjp2/mct.c
+++ b/src/lib/openjp2/mct.c
@@ -209,175 +209,25 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
/* */
/* Forward irreversible MCT. */
/* */
-#ifdef __SSE4_1__
void opj_mct_encode_real(
- OPJ_INT32* OPJ_RESTRICT c0,
- OPJ_INT32* OPJ_RESTRICT c1,
- OPJ_INT32* OPJ_RESTRICT c2,
- OPJ_SIZE_T n)
-{
- OPJ_SIZE_T i;
- const OPJ_SIZE_T len = n;
-
- const __m128i ry = _mm_set1_epi32(2449);
- const __m128i gy = _mm_set1_epi32(4809);
- const __m128i by = _mm_set1_epi32(934);
- const __m128i ru = _mm_set1_epi32(1382);
- const __m128i gu = _mm_set1_epi32(2714);
- /* const __m128i bu = _mm_set1_epi32(4096); */
- /* const __m128i rv = _mm_set1_epi32(4096); */
- const __m128i gv = _mm_set1_epi32(3430);
- const __m128i bv = _mm_set1_epi32(666);
- const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096),
- _MM_SHUFFLE(1, 0, 1, 0));
-
- for (i = 0; i < (len & ~3U); i += 4) {
- __m128i lo, hi;
- __m128i y, u, v;
- __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
- __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
- __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
-
- lo = r;
- hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, ry);
- hi = _mm_mul_epi32(hi, ry);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- y = _mm_blend_epi16(lo, hi, 0xCC);
-
- lo = g;
- hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, gy);
- hi = _mm_mul_epi32(hi, gy);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
-
- lo = b;
- hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, by);
- hi = _mm_mul_epi32(hi, by);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
- _mm_store_si128((__m128i *) & (c0[i]), y);
-
- /*lo = b;
- hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, mulround);
- hi = _mm_mul_epi32(hi, mulround);*/
- lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0)));
- hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1)));
- lo = _mm_slli_epi64(lo, 12);
- hi = _mm_slli_epi64(hi, 12);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- u = _mm_blend_epi16(lo, hi, 0xCC);
-
- lo = r;
- hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, ru);
- hi = _mm_mul_epi32(hi, ru);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
-
- lo = g;
- hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, gu);
- hi = _mm_mul_epi32(hi, gu);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
- _mm_store_si128((__m128i *) & (c1[i]), u);
-
- /*lo = r;
- hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, mulround);
- hi = _mm_mul_epi32(hi, mulround);*/
- lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0)));
- hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1)));
- lo = _mm_slli_epi64(lo, 12);
- hi = _mm_slli_epi64(hi, 12);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- v = _mm_blend_epi16(lo, hi, 0xCC);
-
- lo = g;
- hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, gv);
- hi = _mm_mul_epi32(hi, gv);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
-
- lo = b;
- hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
- lo = _mm_mul_epi32(lo, bv);
- hi = _mm_mul_epi32(hi, bv);
- lo = _mm_add_epi64(lo, mulround);
- hi = _mm_add_epi64(hi, mulround);
- lo = _mm_srli_epi64(lo, 13);
- hi = _mm_slli_epi64(hi, 32 - 13);
- v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
- _mm_store_si128((__m128i *) & (c2[i]), v);
- }
- for (; i < len; ++i) {
- OPJ_INT32 r = c0[i];
- OPJ_INT32 g = c1[i];
- OPJ_INT32 b = c2[i];
- OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g,
- 4809) + opj_int_fix_mul(b, 934);
- OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g,
- 2714) + opj_int_fix_mul(b, 4096);
- OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g,
- 3430) - opj_int_fix_mul(b, 666);
- c0[i] = y;
- c1[i] = u;
- c2[i] = v;
- }
-}
-#else
-void opj_mct_encode_real(
- OPJ_INT32* OPJ_RESTRICT c0,
- OPJ_INT32* OPJ_RESTRICT c1,
- OPJ_INT32* OPJ_RESTRICT c2,
+ OPJ_FLOAT32* OPJ_RESTRICT c0,
+ OPJ_FLOAT32* OPJ_RESTRICT c1,
+ OPJ_FLOAT32* OPJ_RESTRICT c2,
OPJ_SIZE_T n)
{
OPJ_SIZE_T i;
for (i = 0; i < n; ++i) {
- OPJ_INT32 r = c0[i];
- OPJ_INT32 g = c1[i];
- OPJ_INT32 b = c2[i];
- OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g,
- 4809) + opj_int_fix_mul(b, 934);
- OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g,
- 2714) + opj_int_fix_mul(b, 4096);
- OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g,
- 3430) - opj_int_fix_mul(b, 666);
+ OPJ_FLOAT32 r = c0[i];
+ OPJ_FLOAT32 g = c1[i];
+ OPJ_FLOAT32 b = c2[i];
+ OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b;
+ OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b;
+ OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b;
c0[i] = y;
c1[i] = u;
c2[i] = v;
}
}
-#endif
/* */
/* Inverse irreversible MCT. */
diff --git a/src/lib/openjp2/mct.h b/src/lib/openjp2/mct.h
index 2e37ce73..3e1f5e49 100644
--- a/src/lib/openjp2/mct.h
+++ b/src/lib/openjp2/mct.h
@@ -85,8 +85,9 @@ Apply an irreversible multi-component transform to an image
@param c2 Samples blue component
@param n Number of samples for each component
*/
-void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1,
- OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n);
+void opj_mct_encode_real(OPJ_FLOAT32* OPJ_RESTRICT c0,
+ OPJ_FLOAT32* OPJ_RESTRICT c1,
+ OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n);
/**
Apply an irreversible multi-component inverse transform to an image
@param c0 Samples for luminance component
diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c
index 1b9556ea..8d5feadf 100644
--- a/src/lib/openjp2/t1.c
+++ b/src/lib/openjp2/t1.c
@@ -2194,16 +2194,11 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls)
tileIndex += tileLineAdvance;
}
} else { /* if (tccp->qmfbid == 0) */
- const OPJ_INT32 bandconst = 8192 * 8192 / ((OPJ_INT32) floor(
- band->stepsize * 8192));
-
for (j = 0; j < cblk_h; ++j) {
for (i = 0; i < cblk_w; ++i) {
- OPJ_INT32 tmp = tiledp[tileIndex];
- tiledp[tileIndex] =
- opj_int_fix_mul_t1(
- tmp,
- bandconst);
+ OPJ_FLOAT32 tmp = ((OPJ_FLOAT32*)tiledp)[tileIndex];
+ tiledp[tileIndex] = (OPJ_INT32)opj_lrintf((tmp / band->stepsize) *
+ (1 << T1_NMSEDEC_FRACBITS));
tileIndex++;
}
tileIndex += tileLineAdvance;
diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c
index d5d60aaf..503dc472 100644
--- a/src/lib/openjp2/tcd.c
+++ b/src/lib/openjp2/tcd.c
@@ -2411,7 +2411,8 @@ static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd)
}
} else {
for (i = 0; i < l_nb_elem; ++i) {
- *l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11);
+ *((OPJ_FLOAT32 *) l_current_ptr) = (OPJ_FLOAT32)(*l_current_ptr -
+ l_tccp->m_dc_level_shift);
++l_current_ptr;
}
}
@@ -2469,8 +2470,11 @@ static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd)
opj_free(l_data);
} else if (l_tcp->tccps->qmfbid == 0) {
- opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data,
- l_tile->comps[2].data, samples);
+ opj_mct_encode_real(
+ (OPJ_FLOAT32*)l_tile->comps[0].data,
+ (OPJ_FLOAT32*)l_tile->comps[1].data,
+ (OPJ_FLOAT32*)l_tile->comps[2].data,
+ samples);
} else {
opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data,
l_tile->comps[2].data, samples);