Encoder: use floating-point operations for irreversible transformation

This commit is contained in:
Even Rouault 2020-05-18 20:15:07 +02:00
parent 99107d5e46
commit 00cff6f5c0
No known key found for this signature in database
GPG Key ID: 33EBBFC47B3DD87D
5 changed files with 49 additions and 192 deletions

View File

@ -134,12 +134,12 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
/**
Forward 5-3 wavelet transform in 1-D
*/
static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas);
/**
Forward 9-7 wavelet transform in 1-D
*/
static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas);
/**
Explicit calculation of the Quantization Stepsizes
@ -156,9 +156,13 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres);
/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32,
OPJ_INT32);
static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
opj_encode_one_row_fnptr_type p_function);
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
OPJ_UINT32 i);
@ -346,10 +350,11 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
/* <summary> */
/* Forward 5-3 wavelet transform in 1-D. */
/* </summary> */
static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas)
{
OPJ_INT32 i;
OPJ_INT32* a = (OPJ_INT32*)aIn;
if (!cas) {
if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
@ -1039,50 +1044,52 @@ static void opj_idwt53_v(const opj_dwt_t *dwt,
/* <summary> */
/* Forward 9-7 wavelet transform in 1-D. */
/* </summary> */
static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas)
{
OPJ_INT32 i;
OPJ_FLOAT32* a = (OPJ_FLOAT32*)aIn;
if (!cas) {
if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
for (i = 0; i < dn; i++) {
OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
OPJ_D(i) += opj_dwt_alpha * (OPJ_S_(i) + OPJ_S_(i + 1));
}
for (i = 0; i < sn; i++) {
OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
OPJ_S(i) += opj_dwt_beta * (OPJ_D_(i - 1) + OPJ_D_(i));
}
for (i = 0; i < dn; i++) {
OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
OPJ_D(i) += opj_dwt_gamma * (OPJ_S_(i) + OPJ_S_(i + 1));
}
for (i = 0; i < sn; i++) {
OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
OPJ_S(i) += opj_dwt_delta * (OPJ_D_(i - 1) + OPJ_D_(i));
}
for (i = 0; i < dn; i++) {
OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */
OPJ_D(i) = opj_K / 2 * OPJ_D(i);
}
for (i = 0; i < sn; i++) {
OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */
OPJ_S(i) = opj_c13318 / 2 * OPJ_S(i);
}
}
} else {
if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
for (i = 0; i < dn; i++) {
OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
OPJ_S(i) += opj_dwt_alpha * (OPJ_DD_(i) + OPJ_DD_(i - 1));
}
for (i = 0; i < sn; i++) {
OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
OPJ_D(i) += opj_dwt_beta * (OPJ_SS_(i) + OPJ_SS_(i + 1));
}
for (i = 0; i < dn; i++) {
OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
OPJ_S(i) += opj_dwt_gamma * (OPJ_DD_(i) + OPJ_DD_(i - 1));
}
for (i = 0; i < sn; i++) {
OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
OPJ_D(i) += opj_dwt_delta * (OPJ_SS_(i) + OPJ_SS_(i + 1));
}
for (i = 0; i < dn; i++) {
OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */
OPJ_S(i) = opj_K / 2 * OPJ_S(i);
}
for (i = 0; i < sn; i++) {
OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */
OPJ_D(i) = opj_c13318 / 2 * OPJ_D(i);
}
}
}
@ -1112,7 +1119,7 @@ typedef struct {
OPJ_INT32 * OPJ_RESTRICT tiledp;
OPJ_UINT32 min_j;
OPJ_UINT32 max_j;
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
opj_encode_one_row_fnptr_type p_function;
} opj_dwt_encode_h_job_t;
static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
@ -1143,7 +1150,7 @@ typedef struct {
OPJ_INT32 * OPJ_RESTRICT tiledp;
OPJ_UINT32 min_j;
OPJ_UINT32 max_j;
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
opj_encode_one_row_fnptr_type p_function;
} opj_dwt_encode_v_job_t;
static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
@ -1175,7 +1182,7 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
/* </summary> */
static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
opj_encode_one_row_fnptr_type p_function)
{
OPJ_INT32 i;
OPJ_INT32 *bj = 00;

View File

@ -209,175 +209,25 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
/* <summary> */
/* Forward irreversible MCT. */
/* </summary> */
#ifdef __SSE4_1__
void opj_mct_encode_real(
OPJ_INT32* OPJ_RESTRICT c0,
OPJ_INT32* OPJ_RESTRICT c1,
OPJ_INT32* OPJ_RESTRICT c2,
OPJ_SIZE_T n)
{
OPJ_SIZE_T i;
const OPJ_SIZE_T len = n;
const __m128i ry = _mm_set1_epi32(2449);
const __m128i gy = _mm_set1_epi32(4809);
const __m128i by = _mm_set1_epi32(934);
const __m128i ru = _mm_set1_epi32(1382);
const __m128i gu = _mm_set1_epi32(2714);
/* const __m128i bu = _mm_set1_epi32(4096); */
/* const __m128i rv = _mm_set1_epi32(4096); */
const __m128i gv = _mm_set1_epi32(3430);
const __m128i bv = _mm_set1_epi32(666);
const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096),
_MM_SHUFFLE(1, 0, 1, 0));
for (i = 0; i < (len & ~3U); i += 4) {
__m128i lo, hi;
__m128i y, u, v;
__m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
__m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
__m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
lo = r;
hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, ry);
hi = _mm_mul_epi32(hi, ry);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
y = _mm_blend_epi16(lo, hi, 0xCC);
lo = g;
hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, gy);
hi = _mm_mul_epi32(hi, gy);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
lo = b;
hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, by);
hi = _mm_mul_epi32(hi, by);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
_mm_store_si128((__m128i *) & (c0[i]), y);
/*lo = b;
hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, mulround);
hi = _mm_mul_epi32(hi, mulround);*/
lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0)));
hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1)));
lo = _mm_slli_epi64(lo, 12);
hi = _mm_slli_epi64(hi, 12);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
u = _mm_blend_epi16(lo, hi, 0xCC);
lo = r;
hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, ru);
hi = _mm_mul_epi32(hi, ru);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
lo = g;
hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, gu);
hi = _mm_mul_epi32(hi, gu);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
_mm_store_si128((__m128i *) & (c1[i]), u);
/*lo = r;
hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, mulround);
hi = _mm_mul_epi32(hi, mulround);*/
lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0)));
hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1)));
lo = _mm_slli_epi64(lo, 12);
hi = _mm_slli_epi64(hi, 12);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
v = _mm_blend_epi16(lo, hi, 0xCC);
lo = g;
hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, gv);
hi = _mm_mul_epi32(hi, gv);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
lo = b;
hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
lo = _mm_mul_epi32(lo, bv);
hi = _mm_mul_epi32(hi, bv);
lo = _mm_add_epi64(lo, mulround);
hi = _mm_add_epi64(hi, mulround);
lo = _mm_srli_epi64(lo, 13);
hi = _mm_slli_epi64(hi, 32 - 13);
v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
_mm_store_si128((__m128i *) & (c2[i]), v);
}
for (; i < len; ++i) {
OPJ_INT32 r = c0[i];
OPJ_INT32 g = c1[i];
OPJ_INT32 b = c2[i];
OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g,
4809) + opj_int_fix_mul(b, 934);
OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g,
2714) + opj_int_fix_mul(b, 4096);
OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g,
3430) - opj_int_fix_mul(b, 666);
c0[i] = y;
c1[i] = u;
c2[i] = v;
}
}
#else
void opj_mct_encode_real(
OPJ_INT32* OPJ_RESTRICT c0,
OPJ_INT32* OPJ_RESTRICT c1,
OPJ_INT32* OPJ_RESTRICT c2,
OPJ_FLOAT32* OPJ_RESTRICT c0,
OPJ_FLOAT32* OPJ_RESTRICT c1,
OPJ_FLOAT32* OPJ_RESTRICT c2,
OPJ_SIZE_T n)
{
OPJ_SIZE_T i;
for (i = 0; i < n; ++i) {
OPJ_INT32 r = c0[i];
OPJ_INT32 g = c1[i];
OPJ_INT32 b = c2[i];
OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g,
4809) + opj_int_fix_mul(b, 934);
OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g,
2714) + opj_int_fix_mul(b, 4096);
OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g,
3430) - opj_int_fix_mul(b, 666);
OPJ_FLOAT32 r = c0[i];
OPJ_FLOAT32 g = c1[i];
OPJ_FLOAT32 b = c2[i];
OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b;
OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b;
OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b;
c0[i] = y;
c1[i] = u;
c2[i] = v;
}
}
#endif
/* <summary> */
/* Inverse irreversible MCT. */

View File

@ -85,8 +85,9 @@ Apply an irreversible multi-component transform to an image
@param c2 Samples blue component
@param n Number of samples for each component
*/
void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1,
OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n);
void opj_mct_encode_real(OPJ_FLOAT32* OPJ_RESTRICT c0,
OPJ_FLOAT32* OPJ_RESTRICT c1,
OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n);
/**
Apply an irreversible multi-component inverse transform to an image
@param c0 Samples for luminance component

View File

@ -2194,16 +2194,11 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls)
tileIndex += tileLineAdvance;
}
} else { /* if (tccp->qmfbid == 0) */
const OPJ_INT32 bandconst = 8192 * 8192 / ((OPJ_INT32) floor(
band->stepsize * 8192));
for (j = 0; j < cblk_h; ++j) {
for (i = 0; i < cblk_w; ++i) {
OPJ_INT32 tmp = tiledp[tileIndex];
tiledp[tileIndex] =
opj_int_fix_mul_t1(
tmp,
bandconst);
OPJ_FLOAT32 tmp = ((OPJ_FLOAT32*)tiledp)[tileIndex];
tiledp[tileIndex] = (OPJ_INT32)opj_lrintf((tmp / band->stepsize) *
(1 << T1_NMSEDEC_FRACBITS));
tileIndex++;
}
tileIndex += tileLineAdvance;

View File

@ -2411,7 +2411,8 @@ static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd)
}
} else {
for (i = 0; i < l_nb_elem; ++i) {
*l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11);
*((OPJ_FLOAT32 *) l_current_ptr) = (OPJ_FLOAT32)(*l_current_ptr -
l_tccp->m_dc_level_shift);
++l_current_ptr;
}
}
@ -2469,8 +2470,11 @@ static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t *p_tcd)
opj_free(l_data);
} else if (l_tcp->tccps->qmfbid == 0) {
opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data,
l_tile->comps[2].data, samples);
opj_mct_encode_real(
(OPJ_FLOAT32*)l_tile->comps[0].data,
(OPJ_FLOAT32*)l_tile->comps[1].data,
(OPJ_FLOAT32*)l_tile->comps[2].data,
samples);
} else {
opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data,
l_tile->comps[2].data, samples);