diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 4b00c83a..bdc91cf5 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -134,12 +134,12 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, /** Forward 5-3 wavelet transform in 1-D */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** Forward 9-7 wavelet transform in 1-D */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** Explicit calculation of the Quantization Stepsizes @@ -156,9 +156,13 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); +/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ +typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32, + OPJ_INT32); + static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); + opj_encode_one_row_fnptr_type p_function); static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, OPJ_UINT32 i); @@ -346,10 +350,11 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) /* */ /* Forward 5-3 wavelet transform in 1-D. */ /* */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; + OPJ_INT32* a = (OPJ_INT32*)aIn; if (!cas) { if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ @@ -1039,50 +1044,52 @@ static void opj_idwt53_v(const opj_dwt_t *dwt, /* */ /* Forward 9-7 wavelet transform in 1-D. */ /* */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; + OPJ_FLOAT32* a = (OPJ_FLOAT32*)aIn; + if (!cas) { if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ for (i = 0; i < dn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); + OPJ_D(i) += opj_dwt_alpha * (OPJ_S_(i) + OPJ_S_(i + 1)); } for (i = 0; i < sn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); + OPJ_S(i) += opj_dwt_beta * (OPJ_D_(i - 1) + OPJ_D_(i)); } for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); + OPJ_D(i) += opj_dwt_gamma * (OPJ_S_(i) + OPJ_S_(i + 1)); } for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); + OPJ_S(i) += opj_dwt_delta * (OPJ_D_(i - 1) + OPJ_D_(i)); } for (i = 0; i < dn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ + OPJ_D(i) = opj_K / 2 * OPJ_D(i); } for (i = 0; i < sn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ + OPJ_S(i) = opj_c13318 / 2 * OPJ_S(i); } } } else { if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ for (i = 0; i < dn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); + OPJ_S(i) += opj_dwt_alpha * (OPJ_DD_(i) + OPJ_DD_(i - 1)); } for (i = 0; i < sn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); + OPJ_D(i) += opj_dwt_beta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); } for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233); + OPJ_S(i) += opj_dwt_gamma * (OPJ_DD_(i) + OPJ_DD_(i - 1)); } for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); + OPJ_D(i) += opj_dwt_delta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); } for (i = 0; i < dn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ + OPJ_S(i) = opj_K / 2 * OPJ_S(i); } for (i = 0; i < sn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ + OPJ_D(i) = opj_c13318 / 2 * OPJ_D(i); } } } @@ -1112,7 +1119,7 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); + opj_encode_one_row_fnptr_type p_function; } opj_dwt_encode_h_job_t; static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) @@ -1143,7 +1150,7 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); + opj_encode_one_row_fnptr_type p_function; } opj_dwt_encode_v_job_t; static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) @@ -1175,7 +1182,7 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) /* */ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)) + opj_encode_one_row_fnptr_type p_function) { OPJ_INT32 i; OPJ_INT32 *bj = 00; diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c index 08bc8115..9d79b50a 100644 --- a/src/lib/openjp2/mct.c +++ b/src/lib/openjp2/mct.c @@ -209,175 +209,25 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) /* */ /* Forward irreversible MCT. */ /* */ -#ifdef __SSE4_1__ void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - const __m128i ry = _mm_set1_epi32(2449); - const __m128i gy = _mm_set1_epi32(4809); - const __m128i by = _mm_set1_epi32(934); - const __m128i ru = _mm_set1_epi32(1382); - const __m128i gu = _mm_set1_epi32(2714); - /* const __m128i bu = _mm_set1_epi32(4096); */ - /* const __m128i rv = _mm_set1_epi32(4096); */ - const __m128i gv = _mm_set1_epi32(3430); - const __m128i bv = _mm_set1_epi32(666); - const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), - _MM_SHUFFLE(1, 0, 1, 0)); - - for (i = 0; i < (len & ~3U); i += 4) { - __m128i lo, hi; - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ry); - hi = _mm_mul_epi32(hi, ry); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gy); - hi = _mm_mul_epi32(hi, gy); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, by); - hi = _mm_mul_epi32(hi, by); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c0[i]), y); - - /*lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_blend_epi16(lo, hi, 0xCC); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ru); - hi = _mm_mul_epi32(hi, ru); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gu); - hi = _mm_mul_epi32(hi, gu); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c1[i]), u); - - /*lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gv); - hi = _mm_mul_epi32(hi, gv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, bv); - hi = _mm_mul_epi32(hi, bv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c2[i]), v); - } - for (; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#else -void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, + OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n) { OPJ_SIZE_T i; for (i = 0; i < n; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); + OPJ_FLOAT32 r = c0[i]; + OPJ_FLOAT32 g = c1[i]; + OPJ_FLOAT32 b = c2[i]; + OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b; + OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b; + OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b; c0[i] = y; c1[i] = u; c2[i] = v; } } -#endif /* */ /* Inverse irreversible MCT. */ diff --git a/src/lib/openjp2/mct.h b/src/lib/openjp2/mct.h index 2e37ce73..3e1f5e49 100644 --- a/src/lib/openjp2/mct.h +++ b/src/lib/openjp2/mct.h @@ -85,8 +85,9 @@ Apply an irreversible multi-component transform to an image @param c2 Samples blue component @param n Number of samples for each component */ -void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); +void opj_mct_encode_real(OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); /** Apply an irreversible multi-component inverse transform to an image @param c0 Samples for luminance component diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 1b9556ea..8d5feadf 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -2194,16 +2194,11 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) tileIndex += tileLineAdvance; } } else { /* if (tccp->qmfbid == 0) */ - const OPJ_INT32 bandconst = 8192 * 8192 / ((OPJ_INT32) floor( - band->stepsize * 8192)); - for (j = 0; j < cblk_h; ++j) { for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 tmp = tiledp[tileIndex]; - tiledp[tileIndex] = - opj_int_fix_mul_t1( - tmp, - bandconst); + OPJ_FLOAT32 tmp = ((OPJ_FLOAT32*)tiledp)[tileIndex]; + tiledp[tileIndex] = (OPJ_INT32)opj_lrintf((tmp / band->stepsize) * + (1 << T1_NMSEDEC_FRACBITS)); tileIndex++; } tileIndex += tileLineAdvance; diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index d5d60aaf..503dc472 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -2411,7 +2411,8 @@ static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd) } } else { for (i = 0; i < l_nb_elem; ++i) { - *l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11); + *((OPJ_FLOAT32 *) l_current_ptr) = (OPJ_FLOAT32)(*l_current_ptr - + l_tccp->m_dc_level_shift); ++l_current_ptr; } } @@ -2469,8 +2470,11 @@ static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd) opj_free(l_data); } else if (l_tcp->tccps->qmfbid == 0) { - opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data, - l_tile->comps[2].data, samples); + opj_mct_encode_real( + (OPJ_FLOAT32*)l_tile->comps[0].data, + (OPJ_FLOAT32*)l_tile->comps[1].data, + (OPJ_FLOAT32*)l_tile->comps[2].data, + samples); } else { opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data, l_tile->comps[2].data, samples);