From 97eb7e0bf17b476d516262e0af462ec7eeb8f505 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 29 Apr 2020 11:50:17 +0200 Subject: [PATCH 01/24] Add multithreading support in the T1 (entropy phase) encoder - API wise, opj_codec_set_threads() can be used on the encoding side - opj_compress has a -threads switch similar to opj_uncompress --- src/bin/jp2/opj_compress.c | 35 ++++- src/lib/openjp2/openjpeg.c | 6 + src/lib/openjp2/openjpeg.h | 11 +- src/lib/openjp2/t1.c | 296 ++++++++++++++++++++++++------------- src/lib/openjp2/t1.h | 4 +- src/lib/openjp2/tcd.c | 16 +- 6 files changed, 239 insertions(+), 129 deletions(-) diff --git a/src/bin/jp2/opj_compress.c b/src/bin/jp2/opj_compress.c index cbc30fba..68274840 100644 --- a/src/bin/jp2/opj_compress.c +++ b/src/bin/jp2/opj_compress.c @@ -301,6 +301,10 @@ static void encode_help_display(void) fprintf(stdout, " Currently supports only RPCL order.\n"); fprintf(stdout, "-C \n"); fprintf(stdout, " Add in the comment marker segment.\n"); + if (opj_has_thread_support()) { + fprintf(stdout, " -threads \n" + " Number of threads to use for encoding or ALL_CPUS for all available cores.\n"); + } /* UniPG>> */ #ifdef USE_JPWL fprintf(stdout, "-W \n"); @@ -579,7 +583,8 @@ static int parse_cmdline_encoder(int argc, char **argv, img_fol_t *img_fol, raw_cparameters_t *raw_cp, char *indexfilename, size_t indexfilename_size, int* pOutFramerate, - OPJ_BOOL* pOutPLT) + OPJ_BOOL* pOutPLT, + int* pOutNumThreads) { OPJ_UINT32 i, j; int totlen, c; @@ -596,7 +601,8 @@ static int parse_cmdline_encoder(int argc, char **argv, {"jpip", NO_ARG, NULL, 'J'}, {"mct", REQ_ARG, NULL, 'Y'}, {"IMF", REQ_ARG, NULL, 'Z'}, - {"PLT", NO_ARG, NULL, 'A'} + {"PLT", NO_ARG, NULL, 'A'}, + {"threads", REQ_ARG, NULL, 'B'} }; /* parse the command line */ @@ -1679,6 +1685,19 @@ static int parse_cmdline_encoder(int argc, char **argv, } break; + /* ----------------------------------------------------- */ + case 'B': { /* Number of threads */ + if 
(strcmp(opj_optarg, "ALL_CPUS") == 0) { + *pOutNumThreads = opj_get_num_cpus(); + if (*pOutNumThreads == 1) { + *pOutNumThreads = 0; + } + } else { + sscanf(opj_optarg, "%d", pOutNumThreads); + } + } + break; + /* ------------------------------------------------------ */ @@ -1860,6 +1879,7 @@ int main(int argc, char **argv) OPJ_FLOAT64 t = opj_clock(); OPJ_BOOL PLT = OPJ_FALSE; + int num_threads = 0; /* set encoding parameters to default values */ opj_set_default_encoder_parameters(&parameters); @@ -1880,7 +1900,7 @@ int main(int argc, char **argv) parameters.tcp_mct = (char) 255; /* This will be set later according to the input image or the provided option */ if (parse_cmdline_encoder(argc, argv, &parameters, &img_fol, &raw_cp, - indexfilename, sizeof(indexfilename), &framerate, &PLT) == 1) { + indexfilename, sizeof(indexfilename), &framerate, &PLT, &num_threads) == 1) { ret = 1; goto fin; } @@ -2141,6 +2161,15 @@ int main(int argc, char **argv) } } + if (num_threads >= 1 && + !opj_codec_set_threads(l_codec, num_threads)) { + fprintf(stderr, "failed to set number of threads\n"); + opj_destroy_codec(l_codec); + opj_image_destroy(image); + ret = 1; + goto fin; + } + /* open a byte stream for writing and allocate memory for all tiles */ l_stream = opj_stream_create_default_file_stream(parameters.outfile, OPJ_FALSE); if (! l_stream) { diff --git a/src/lib/openjp2/openjpeg.c b/src/lib/openjp2/openjpeg.c index 1e2d60a6..9c9b6eb0 100644 --- a/src/lib/openjp2/openjpeg.c +++ b/src/lib/openjp2/openjpeg.c @@ -657,6 +657,9 @@ opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT p_format) const char* const*, struct opj_event_mgr *)) opj_j2k_encoder_set_extra_options; + l_codec->opj_set_threads = + (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_j2k_set_threads; + l_codec->m_codec = opj_j2k_create_compress(); if (! 
l_codec->m_codec) { opj_free(l_codec); @@ -700,6 +703,9 @@ opj_codec_t* OPJ_CALLCONV opj_create_compress(OPJ_CODEC_FORMAT p_format) const char* const*, struct opj_event_mgr *)) opj_jp2_encoder_set_extra_options; + l_codec->opj_set_threads = + (OPJ_BOOL(*)(void * p_codec, OPJ_UINT32 num_threads)) opj_jp2_set_threads; + l_codec->m_codec = opj_jp2_create(OPJ_FALSE); if (! l_codec->m_codec) { opj_free(l_codec); diff --git a/src/lib/openjp2/openjpeg.h b/src/lib/openjp2/openjpeg.h index da84f399..4bbd9a8b 100644 --- a/src/lib/openjp2/openjpeg.h +++ b/src/lib/openjp2/openjpeg.h @@ -1348,15 +1348,14 @@ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, * number, or "ALL_CPUS". If OPJ_NUM_THREADS is set and this function is called, * this function will override the behaviour of the environment variable. * - * Currently this function must be called after opj_setup_decoder() and - * before opj_read_header(). + * This function must be called after opj_setup_decoder() and + * before opj_read_header() for the decoding side, or after opj_setup_encoder() + * and before opj_start_compress() for the encoding side. * - * Note: currently only has effect on the decompressor. - * - * @param p_codec decompressor handler + * @param p_codec decompressor or compressor handler * @param num_threads number of threads. * - * @return OPJ_TRUE if the decoder is correctly set + * @return OPJ_TRUE if the function is successful. 
*/ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_codec_set_threads(opj_codec_t *p_codec, int num_threads); diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index f6f76711..1b9556ea 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -177,18 +177,18 @@ static OPJ_FLOAT64 opj_t1_getwmsedec( const OPJ_FLOAT64 * mct_norms, OPJ_UINT32 mct_numcomps); -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps); +/** Return "cumwmsedec" that should be used to increase tile->distotile */ +static double opj_t1_encode_cblk(opj_t1_t *t1, + opj_tcd_cblk_enc_t* cblk, + OPJ_UINT32 orient, + OPJ_UINT32 compno, + OPJ_UINT32 level, + OPJ_UINT32 qmfbid, + OPJ_FLOAT64 stepsize, + OPJ_UINT32 cblksty, + OPJ_UINT32 numcomps, + const OPJ_FLOAT64 * mct_norms, + OPJ_UINT32 mct_numcomps); /** Decode 1 code-block @@ -2100,124 +2100,210 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, } +typedef struct { + OPJ_UINT32 compno; + OPJ_UINT32 resno; + opj_tcd_cblk_enc_t* cblk; + opj_tcd_tile_t *tile; + opj_tcd_band_t* band; + opj_tcd_tilecomp_t* tilec; + opj_tccp_t* tccp; + const OPJ_FLOAT64 * mct_norms; + OPJ_UINT32 mct_numcomps; + volatile OPJ_BOOL* pret; + opj_mutex_t* mutex; +} opj_t1_cblk_encode_processing_job_t; + +/** Procedure to deal with a asynchronous code-block encoding job. + * + * @param user_data Pointer to a opj_t1_cblk_encode_processing_job_t* structure + * @param tls TLS handle. 
+ */ +static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) +{ + opj_t1_cblk_encode_processing_job_t* job = + (opj_t1_cblk_encode_processing_job_t*)user_data; + opj_tcd_cblk_enc_t* cblk = job->cblk; + const opj_tcd_band_t* band = job->band; + const opj_tcd_tilecomp_t* tilec = job->tilec; + const opj_tccp_t* tccp = job->tccp; + const OPJ_UINT32 resno = job->resno; + opj_t1_t* t1; + const OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); + + OPJ_INT32* OPJ_RESTRICT tiledp; + OPJ_UINT32 cblk_w; + OPJ_UINT32 cblk_h; + OPJ_UINT32 i, j, tileLineAdvance; + OPJ_SIZE_T tileIndex = 0; + + OPJ_INT32 x = cblk->x0 - band->x0; + OPJ_INT32 y = cblk->y0 - band->y0; + + if (!*(job->pret)) { + opj_free(job); + return; + } + + t1 = (opj_t1_t*) opj_tls_get(tls, OPJ_TLS_KEY_T1); + if (t1 == NULL) { + t1 = opj_t1_create(OPJ_TRUE); /* OPJ_TRUE == T1 for encoding */ + opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper); + } + + if (band->bandno & 1) { + opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; + x += pres->x1 - pres->x0; + } + if (band->bandno & 2) { + opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; + y += pres->y1 - pres->y0; + } + + if (!opj_t1_allocate_buffers( + t1, + (OPJ_UINT32)(cblk->x1 - cblk->x0), + (OPJ_UINT32)(cblk->y1 - cblk->y0))) { + *(job->pret) = OPJ_FALSE; + opj_free(job); + return; + } + + cblk_w = t1->w; + cblk_h = t1->h; + tileLineAdvance = tile_w - cblk_w; + + tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; + t1->data = tiledp; + t1->data_stride = tile_w; + if (tccp->qmfbid == 1) { + /* Do multiplication on unsigned type, even if the + * underlying type is signed, to avoid potential + * int overflow on large value (the output will be + * incorrect in such situation, but whatever...) 
+ * This assumes complement-to-2 signed integer + * representation + * Fixes https://github.com/uclouvain/openjpeg/issues/1053 + */ + OPJ_UINT32* OPJ_RESTRICT tiledp_u = (OPJ_UINT32*) tiledp; + for (j = 0; j < cblk_h; ++j) { + for (i = 0; i < cblk_w; ++i) { + tiledp_u[tileIndex] <<= T1_NMSEDEC_FRACBITS; + tileIndex++; + } + tileIndex += tileLineAdvance; + } + } else { /* if (tccp->qmfbid == 0) */ + const OPJ_INT32 bandconst = 8192 * 8192 / ((OPJ_INT32) floor( + band->stepsize * 8192)); + + for (j = 0; j < cblk_h; ++j) { + for (i = 0; i < cblk_w; ++i) { + OPJ_INT32 tmp = tiledp[tileIndex]; + tiledp[tileIndex] = + opj_int_fix_mul_t1( + tmp, + bandconst); + tileIndex++; + } + tileIndex += tileLineAdvance; + } + } + + { + OPJ_FLOAT64 cumwmsedec = + opj_t1_encode_cblk( + t1, + cblk, + band->bandno, + job->compno, + tilec->numresolutions - 1 - resno, + tccp->qmfbid, + band->stepsize, + tccp->cblksty, + job->tile->numcomps, + job->mct_norms, + job->mct_numcomps); + if (job->mutex) { + opj_mutex_lock(job->mutex); + } + job->tile->distotile += cumwmsedec; + if (job->mutex) { + opj_mutex_unlock(job->mutex); + } + } + + opj_free(job); +} -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, +OPJ_BOOL opj_t1_encode_cblks(opj_tcd_t* tcd, opj_tcd_tile_t *tile, opj_tcp_t *tcp, const OPJ_FLOAT64 * mct_norms, OPJ_UINT32 mct_numcomps ) { + volatile OPJ_BOOL ret = OPJ_TRUE; + opj_thread_pool_t* tp = tcd->thread_pool; OPJ_UINT32 compno, resno, bandno, precno, cblkno; + opj_mutex_t* mutex = opj_mutex_create(); tile->distotile = 0; /* fixed_quality */ for (compno = 0; compno < tile->numcomps; ++compno) { opj_tcd_tilecomp_t* tilec = &tile->comps[compno]; opj_tccp_t* tccp = &tcp->tccps[compno]; - OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); for (resno = 0; resno < tilec->numresolutions; ++resno) { opj_tcd_resolution_t *res = &tilec->resolutions[resno]; for (bandno = 0; bandno < res->numbands; ++bandno) { opj_tcd_band_t* OPJ_RESTRICT band = &res->bands[bandno]; - OPJ_INT32 bandconst; /* 
Skip empty bands */ if (opj_tcd_is_band_empty(band)) { continue; } - - bandconst = 8192 * 8192 / ((OPJ_INT32) floor(band->stepsize * 8192)); for (precno = 0; precno < res->pw * res->ph; ++precno) { opj_tcd_precinct_t *prc = &band->precincts[precno]; for (cblkno = 0; cblkno < prc->cw * prc->ch; ++cblkno) { opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno]; - OPJ_INT32* OPJ_RESTRICT tiledp; - OPJ_UINT32 cblk_w; - OPJ_UINT32 cblk_h; - OPJ_UINT32 i, j, tileLineAdvance; - OPJ_SIZE_T tileIndex = 0; - OPJ_INT32 x = cblk->x0 - band->x0; - OPJ_INT32 y = cblk->y0 - band->y0; - if (band->bandno & 1) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; - x += pres->x1 - pres->x0; + opj_t1_cblk_encode_processing_job_t* job = + (opj_t1_cblk_encode_processing_job_t*) opj_calloc(1, + sizeof(opj_t1_cblk_encode_processing_job_t)); + if (!job) { + ret = OPJ_FALSE; + goto end; } - if (band->bandno & 2) { - opj_tcd_resolution_t *pres = &tilec->resolutions[resno - 1]; - y += pres->y1 - pres->y0; - } - - if (!opj_t1_allocate_buffers( - t1, - (OPJ_UINT32)(cblk->x1 - cblk->x0), - (OPJ_UINT32)(cblk->y1 - cblk->y0))) { - return OPJ_FALSE; - } - - cblk_w = t1->w; - cblk_h = t1->h; - tileLineAdvance = tile_w - cblk_w; - - tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; - t1->data = tiledp; - t1->data_stride = tile_w; - if (tccp->qmfbid == 1) { - /* Do multiplication on unsigned type, even if the - * underlying type is signed, to avoid potential - * int overflow on large value (the output will be - * incorrect in such situation, but whatever...) 
- * This assumes complement-to-2 signed integer - * representation - * Fixes https://github.com/uclouvain/openjpeg/issues/1053 - */ - OPJ_UINT32* OPJ_RESTRICT tiledp_u = (OPJ_UINT32*) tiledp; - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - tiledp_u[tileIndex] <<= T1_NMSEDEC_FRACBITS; - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } else { /* if (tccp->qmfbid == 0) */ - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 tmp = tiledp[tileIndex]; - tiledp[tileIndex] = - opj_int_fix_mul_t1( - tmp, - bandconst); - tileIndex++; - } - tileIndex += tileLineAdvance; - } - } - - opj_t1_encode_cblk( - t1, - cblk, - band->bandno, - compno, - tilec->numresolutions - 1 - resno, - tccp->qmfbid, - band->stepsize, - tccp->cblksty, - tile->numcomps, - tile, - mct_norms, - mct_numcomps); + job->compno = compno; + job->tile = tile; + job->resno = resno; + job->cblk = cblk; + job->band = band; + job->tilec = tilec; + job->tccp = tccp; + job->mct_norms = mct_norms; + job->mct_numcomps = mct_numcomps; + job->pret = &ret; + job->mutex = mutex; + opj_thread_pool_submit_job(tp, opj_t1_clbl_encode_processor, job); } /* cblkno */ } /* precno */ } /* bandno */ } /* resno */ } /* compno */ - return OPJ_TRUE; + +end: + opj_thread_pool_wait_completion(tcd->thread_pool, 0); + if (mutex) { + opj_mutex_destroy(mutex); + } + + return ret; } /* Returns whether the pass (bpno, passtype) is terminated */ @@ -2252,18 +2338,17 @@ static int opj_t1_enc_is_term_pass(opj_tcd_cblk_enc_t* cblk, /** mod fixed_quality */ -static void opj_t1_encode_cblk(opj_t1_t *t1, - opj_tcd_cblk_enc_t* cblk, - OPJ_UINT32 orient, - OPJ_UINT32 compno, - OPJ_UINT32 level, - OPJ_UINT32 qmfbid, - OPJ_FLOAT64 stepsize, - OPJ_UINT32 cblksty, - OPJ_UINT32 numcomps, - opj_tcd_tile_t * tile, - const OPJ_FLOAT64 * mct_norms, - OPJ_UINT32 mct_numcomps) +static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1, + opj_tcd_cblk_enc_t* cblk, + OPJ_UINT32 orient, + OPJ_UINT32 compno, + 
OPJ_UINT32 level, + OPJ_UINT32 qmfbid, + OPJ_FLOAT64 stepsize, + OPJ_UINT32 cblksty, + OPJ_UINT32 numcomps, + const OPJ_FLOAT64 * mct_norms, + OPJ_UINT32 mct_numcomps) { OPJ_FLOAT64 cumwmsedec = 0.0; @@ -2297,7 +2382,7 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, T1_NMSEDEC_FRACBITS) : 0; if (cblk->numbps == 0) { cblk->totalpasses = 0; - return; + return cumwmsedec; } bpno = (OPJ_INT32)(cblk->numbps - 1); @@ -2343,7 +2428,6 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, tempwmsedec = opj_t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps, mct_norms, mct_numcomps) ; cumwmsedec += tempwmsedec; - tile->distotile += tempwmsedec; pass->distortiondec = cumwmsedec; if (opj_t1_enc_is_term_pass(cblk, cblksty, bpno, passtype)) { @@ -2425,4 +2509,6 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, } } #endif + + return cumwmsedec; } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index 171dfb0a..bc8a8111 100644 --- a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -216,13 +216,13 @@ typedef struct opj_t1 { /** Encode the code-blocks of a tile -@param t1 T1 handle +@param tcd TCD handle @param tile The tile to encode @param tcp Tile coding parameters @param mct_norms FIXME DOC @param mct_numcomps Number of components used for MCT */ -OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1, +OPJ_BOOL opj_t1_encode_cblks(opj_tcd_t* tcd, opj_tcd_tile_t *tile, opj_tcp_t *tcp, const OPJ_FLOAT64 * mct_norms, diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index 3a1c3026..108462ca 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -2506,16 +2506,10 @@ static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd) static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd) { - opj_t1_t * l_t1; const OPJ_FLOAT64 * l_mct_norms; OPJ_UINT32 l_mct_numcomps = 0U; opj_tcp_t * l_tcp = p_tcd->tcp; - l_t1 = opj_t1_create(OPJ_TRUE); - if (l_t1 == 00) { - return OPJ_FALSE; - } - if (l_tcp->mct == 1) { l_mct_numcomps = 3U; /* irreversible encoding */ @@ -2529,13 
+2523,9 @@ static OPJ_BOOL opj_tcd_t1_encode(opj_tcd_t *p_tcd) l_mct_norms = (const OPJ_FLOAT64 *)(l_tcp->mct_norms); } - if (! opj_t1_encode_cblks(l_t1, p_tcd->tcd_image->tiles, l_tcp, l_mct_norms, - l_mct_numcomps)) { - opj_t1_destroy(l_t1); - return OPJ_FALSE; - } - - opj_t1_destroy(l_t1); + return opj_t1_encode_cblks(p_tcd, + p_tcd->tcd_image->tiles, l_tcp, l_mct_norms, + l_mct_numcomps); return OPJ_TRUE; } From 07d1f775a1ef95496b0c78b18f671dac41983320 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 30 Apr 2020 11:52:42 +0200 Subject: [PATCH 02/24] Add multithreaded support in the DWT encoder. Update the bench_dwt utility to have a -decode/-encode switch Measured performance gains for DWT encoder on an Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz (4 cores, hyper threaded) Encoding time: $ ./bin/bench_dwt -encode -num_threads 1 time for dwt_encode: total = 8.348 s, wallclock = 8.352 s $ ./bin/bench_dwt -encode -num_threads 2 time for dwt_encode: total = 9.776 s, wallclock = 4.904 s $ ./bin/bench_dwt -encode -num_threads 4 time for dwt_encode: total = 13.188 s, wallclock = 3.310 s $ ./bin/bench_dwt -encode -num_threads 8 time for dwt_encode: total = 30.024 s, wallclock = 4.064 s Scaling is probably limited by memory access patterns causing memory access to be the bottleneck. The slightly worse results with threads==8 than with threads==4 are due to hyperthreading not being appropriate here. 
--- CMakeLists.txt | 2 + src/lib/openjp2/CMakeLists.txt | 4 +- src/lib/openjp2/bench_dwt.c | 56 +++++- src/lib/openjp2/dwt.c | 262 ++++++++++++++++++++++------ src/lib/openjp2/dwt.h | 8 +- src/lib/openjp2/tcd.c | 4 +- tools/ctest_scripts/travis-ci.cmake | 3 + 7 files changed, 275 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ea2424a..050264a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,7 +256,9 @@ if(BUILD_JPIP_SERVER) endif() add_subdirectory(src/lib) option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF) +if(UNIX) option(BUILD_UNIT_TESTS "Build unit tests (bench_dwt, test_sparse_array, etc..)" OFF) +endif() #----------------------------------------------------------------------------- # Build Applications diff --git a/src/lib/openjp2/CMakeLists.txt b/src/lib/openjp2/CMakeLists.txt index b2714858..9f79b9c3 100644 --- a/src/lib/openjp2/CMakeLists.txt +++ b/src/lib/openjp2/CMakeLists.txt @@ -199,7 +199,7 @@ if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT}) endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) -if(BUILD_UNIT_TESTS) +if(BUILD_UNIT_TESTS AND UNIX) add_executable(bench_dwt bench_dwt.c) if(UNIX) target_link_libraries(bench_dwt m ${OPENJPEG_LIBRARY_NAME}) @@ -215,4 +215,4 @@ if(BUILD_UNIT_TESTS) if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) target_link_libraries(test_sparse_array ${CMAKE_THREAD_LIBS_INIT}) endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) -endif(BUILD_UNIT_TESTS) +endif(BUILD_UNIT_TESTS AND UNIX) diff --git a/src/lib/openjp2/bench_dwt.c b/src/lib/openjp2/bench_dwt.c index 8cb64d06..2b274145 100644 --- a/src/lib/openjp2/bench_dwt.c +++ b/src/lib/openjp2/bench_dwt.c @@ -67,6 +67,7 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec, l_tilec->data[i] = getValue((OPJ_UINT32)i); } l_tilec->numresolutions = numresolutions; + 
l_tilec->minimum_num_resolutions = numresolutions; l_tilec->resolutions = (opj_tcd_resolution_t*) opj_calloc( l_tilec->numresolutions, sizeof(opj_tcd_resolution_t)); @@ -98,9 +99,9 @@ void free_tilec(opj_tcd_tilecomp_t * l_tilec) void usage(void) { printf( - "bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n"); + "bench_dwt [-decode|encode] [-size value] [-check] [-display]\n"); printf( - " [-offset x y] [-num_threads val]\n"); + " [-num_resolutions val] [-offset x y] [-num_threads val]\n"); exit(1); } @@ -131,6 +132,17 @@ OPJ_FLOAT64 opj_clock(void) #endif } +static OPJ_FLOAT64 opj_wallclock(void) +{ +#ifdef _WIN32 + return opj_clock(); +#else + struct timeval tv; + gettimeofday(&tv, NULL); + return (OPJ_FLOAT64)tv.tv_sec + 1e-6 * (OPJ_FLOAT64)tv.tv_usec; +#endif +} + int main(int argc, char** argv) { int num_threads = 0; @@ -146,12 +158,18 @@ int main(int argc, char** argv) OPJ_BOOL check = OPJ_FALSE; OPJ_INT32 size = 16384 - 1; OPJ_FLOAT64 start, stop; + OPJ_FLOAT64 start_wc, stop_wc; OPJ_UINT32 offset_x = ((OPJ_UINT32)size + 1) / 2 - 1; OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1; OPJ_UINT32 num_resolutions = 6; + OPJ_BOOL bench_decode = OPJ_TRUE; for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-display") == 0) { + if (strcmp(argv[i], "-encode") == 0) { + bench_decode = OPJ_FALSE; + } else if (strcmp(argv[i], "-decode") == 0) { + bench_decode = OPJ_TRUE; + } else if (strcmp(argv[i], "-display") == 0) { display = OPJ_TRUE; check = OPJ_TRUE; } else if (strcmp(argv[i], "-check") == 0) { @@ -223,13 +241,26 @@ int main(int argc, char** argv) image_comp.dy = 1; start = opj_clock(); - opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); + start_wc = opj_wallclock(); + if (bench_decode) { + opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); + } else { + opj_dwt_encode(&tcd, &tilec); + } stop = opj_clock(); - printf("time for dwt_decode: %.03f s\n", stop - start); + stop_wc = opj_wallclock(); + printf("time for %s: total = %.03f s, 
wallclock = %.03f s\n", + bench_decode ? "dwt_decode" : "dwt_encode", + stop - start, + stop_wc - start_wc); if (display || check) { if (display) { - printf("After IDWT\n"); + if (bench_decode) { + printf("After IDWT\n"); + } else { + printf("After FDWT\n"); + } k = 0; for (j = 0; j < tilec.y1 - tilec.y0; j++) { for (i = 0; i < tilec.x1 - tilec.x0; i++) { @@ -240,9 +271,18 @@ int main(int argc, char** argv) } } - opj_dwt_encode(&tilec); + if (bench_decode) { + opj_dwt_encode(&tcd, &tilec); + } else { + opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); + } + if (display) { - printf("After FDWT\n"); + if (bench_decode) { + printf("After FDWT\n"); + } else { + printf("After IDWT\n"); + } k = 0; for (j = 0; j < tilec.y1 - tilec.y0; j++) { for (i = 0; i < tilec.x1 - tilec.x0; i++) { diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 5930d1c7..ff771817 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -129,7 +129,7 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, Forward lazy transform (vertical) */ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas); + OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas); /** Forward 5-3 wavelet transform in 1-D */ @@ -155,7 +155,8 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); -static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, +static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t * tilec, void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, @@ -271,7 +272,7 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, /* Forward lazy transform (vertical). 
*/ /* */ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) + OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas) { OPJ_INT32 i = sn; OPJ_INT32 * l_dest = b; @@ -1103,28 +1104,92 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, */ +typedef struct { + opj_dwt_t h; + OPJ_UINT32 rw; + OPJ_UINT32 w; + OPJ_INT32 * OPJ_RESTRICT tiledp; + OPJ_UINT32 min_j; + OPJ_UINT32 max_j; + void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); +} opj_dwt_encode_h_job_t; + +static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt_encode_h_job_t* job; + (void)tls; + + job = (opj_dwt_encode_h_job_t*)user_data; + for (j = job->min_j; j < job->max_j; j++) { + OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w; + OPJ_UINT32 k; + for (k = 0; k < job->rw; k++) { + job->h.mem[k] = aj[k]; + } + (*job->p_function)(job->h.mem, job->h.dn, job->h.sn, job->h.cas); + opj_dwt_deinterleave_h(job->h.mem, aj, job->h.dn, job->h.sn, job->h.cas); + } + + opj_aligned_free(job->h.mem); + opj_free(job); +} + +typedef struct { + opj_dwt_t v; + OPJ_UINT32 rh; + OPJ_UINT32 w; + OPJ_INT32 * OPJ_RESTRICT tiledp; + OPJ_UINT32 min_j; + OPJ_UINT32 max_j; + void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); +} opj_dwt_encode_v_job_t; + +static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt_encode_v_job_t* job; + (void)tls; + + job = (opj_dwt_encode_v_job_t*)user_data; + for (j = job->min_j; j < job->max_j; j++) { + OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j; + OPJ_UINT32 k; + for (k = 0; k < job->rh; ++k) { + job->v.mem[k] = aj[k * job->w]; + } + + (*job->p_function)(job->v.mem, job->v.dn, job->v.sn, job->v.cas); + + opj_dwt_deinterleave_v(job->v.mem, aj, job->v.dn, job->v.sn, job->w, + job->v.cas); + } + + opj_aligned_free(job->v.mem); + opj_free(job); +} + /* */ /* Forward 5-3 wavelet transform in 2-D. 
*/ /* */ -static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, +static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t * tilec, void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)) { - OPJ_INT32 i, j, k; - OPJ_INT32 *a = 00; - OPJ_INT32 *aj = 00; + OPJ_INT32 i; OPJ_INT32 *bj = 00; - OPJ_INT32 w, l; + OPJ_UINT32 w; + OPJ_INT32 l; - OPJ_INT32 rw; /* width of the resolution level computed */ - OPJ_INT32 rh; /* height of the resolution level computed */ OPJ_SIZE_T l_data_size; opj_tcd_resolution_t * l_cur_res = 0; opj_tcd_resolution_t * l_last_res = 0; + const int num_threads = opj_thread_pool_get_thread_count(tp); + OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; - w = tilec->x1 - tilec->x0; + w = (OPJ_UINT32)(tilec->x1 - tilec->x0); l = (OPJ_INT32)tilec->numresolutions - 1; - a = tilec->data; l_cur_res = tilec->resolutions + l; l_last_res = l_cur_res - 1; @@ -1136,7 +1201,7 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, return OPJ_FALSE; } l_data_size *= sizeof(OPJ_INT32); - bj = (OPJ_INT32*)opj_malloc(l_data_size); + bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ /* in that case, so do not error out */ if (l_data_size != 0 && ! 
bj) { @@ -1145,43 +1210,137 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, i = l; while (i--) { - OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */ - OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */ + OPJ_UINT32 j; + OPJ_UINT32 rw; /* width of the resolution level computed */ + OPJ_UINT32 rh; /* height of the resolution level computed */ + OPJ_UINT32 + rw1; /* width of the resolution level once lower than computed one */ + OPJ_UINT32 + rh1; /* height of the resolution level once lower than computed one */ OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ OPJ_INT32 dn, sn; - rw = l_cur_res->x1 - l_cur_res->x0; - rh = l_cur_res->y1 - l_cur_res->y0; - rw1 = l_last_res->x1 - l_last_res->x0; - rh1 = l_last_res->y1 - l_last_res->y0; + rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0); + rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0); + rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0); + rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0); cas_row = l_cur_res->x0 & 1; cas_col = l_cur_res->y0 & 1; - sn = rh1; - dn = rh - rh1; - for (j = 0; j < rw; ++j) { - aj = a + j; - for (k = 0; k < rh; ++k) { - bj[k] = aj[k * w]; + sn = (OPJ_INT32)rh1; + dn = (OPJ_INT32)(rh - rh1); + + /* Perform vertical pass */ + if (num_threads <= 1 || rw <= 1) { + for (j = 0; j < rw; ++j) { + OPJ_INT32* OPJ_RESTRICT aj = tiledp + j; + OPJ_UINT32 k; + for (k = 0; k < rh; ++k) { + bj[k] = aj[k * w]; + } + + (*p_function)(bj, dn, sn, cas_col); + + opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); } + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; - (*p_function)(bj, dn, sn, cas_col); + if (rw < num_jobs) { + num_jobs = rw; + } + step_j = (rw / num_jobs); - opj_dwt_deinterleave_v(bj, 
aj, dn, sn, w, cas_col); + for (j = 0; j < num_jobs; j++) { + opj_dwt_encode_v_job_t* job; + + job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); + if (!job->v.mem) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->v.dn = dn; + job->v.sn = sn; + job->v.cas = cas_col; + job->rh = rh; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * step_j; + job->max_j = (j + 1U) * step_j; /* this can overflow */ + if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ + job->max_j = rw; + } + job->p_function = p_function; + opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } - sn = rw1; - dn = rw - rw1; + sn = (OPJ_INT32)rw1; + dn = (OPJ_INT32)(rw - rw1); - for (j = 0; j < rh; j++) { - aj = a + j * w; - for (k = 0; k < rw; k++) { - bj[k] = aj[k]; + /* Perform horizontal pass */ + if (num_threads <= 1 || rh <= 1) { + for (j = 0; j < rh; j++) { + OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w; + OPJ_UINT32 k; + for (k = 0; k < rw; k++) { + bj[k] = aj[k]; + } + (*p_function)(bj, dn, sn, cas_row); + opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); } - (*p_function)(bj, dn, sn, cas_row); - opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; + + if (rh < num_jobs) { + num_jobs = rh; + } + step_j = (rh / num_jobs); + + for (j = 0; j < num_jobs; j++) { + opj_dwt_encode_h_job_t* job; + + job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); + if (!job->h.mem) { + opj_thread_pool_wait_completion(tp, 
0); + opj_free(job); + opj_aligned_free(bj); + return OPJ_FALSE; + } + job->h.dn = dn; + job->h.sn = sn; + job->h.cas = cas_row; + job->rw = rw; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * step_j; + job->max_j = (j + 1U) * step_j; /* this can overflow */ + if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ + job->max_j = rh; + } + job->p_function = p_function; + opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } l_cur_res = l_last_res; @@ -1189,15 +1348,16 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, --l_last_res; } - opj_free(bj); + opj_aligned_free(bj); return OPJ_TRUE; } /* Forward 5-3 wavelet transform in 2-D. */ /* */ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) +OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec) { - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1); + return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, opj_dwt_encode_1); } /* */ @@ -1247,9 +1407,11 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) /* */ /* Forward 9-7 wavelet transform in 2-D. 
*/ /* */ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec) +OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec) { - return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real); + return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, + opj_dwt_encode_1_real); } /* */ @@ -1328,15 +1490,15 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; -} opj_dwd_decode_h_job_t; +} opj_dwt_decode_h_job_t; static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) { OPJ_UINT32 j; - opj_dwd_decode_h_job_t* job; + opj_dwt_decode_h_job_t* job; (void)tls; - job = (opj_dwd_decode_h_job_t*)user_data; + job = (opj_dwt_decode_h_job_t*)user_data; for (j = job->min_j; j < job->max_j; j++) { opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); } @@ -1352,15 +1514,15 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; -} opj_dwd_decode_v_job_t; +} opj_dwt_decode_v_job_t; static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) { OPJ_UINT32 j; - opj_dwd_decode_v_job_t* job; + opj_dwt_decode_v_job_t* job; (void)tls; - job = (opj_dwd_decode_v_job_t*)user_data; + job = (opj_dwt_decode_v_job_t*)user_data; for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; j += PARALLEL_COLS_53) { opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, @@ -1447,9 +1609,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, step_j = (rh / num_jobs); for (j = 0; j < num_jobs; j++) { - opj_dwd_decode_h_job_t* job; + opj_dwt_decode_h_job_t* job; - job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); + job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t)); if (!job) { /* It would be nice to fallback to single thread case, but */ /* unfortunately some jobs may be launched and have modified */ @@ -1502,9 +1664,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, step_j = (rw / num_jobs); for (j = 0; j < num_jobs; j++) { - 
opj_dwd_decode_v_job_t* job; + opj_dwt_decode_v_job_t* job; - job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); + job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t)); if (!job) { /* It would be nice to fallback to single thread case, but */ /* unfortunately some jobs may be launched and have modified */ diff --git a/src/lib/openjp2/dwt.h b/src/lib/openjp2/dwt.h index 4f63e524..89c859cb 100644 --- a/src/lib/openjp2/dwt.h +++ b/src/lib/openjp2/dwt.h @@ -56,9 +56,11 @@ DWT.C are used by some function in TCD.C. /** Forward 5-3 wavelet transform in 2-D. Apply a reversible DWT transform to a component of an image. +@param p_tcd TCD handle @param tilec Tile component information (current tile) */ -OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec); +OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec); /** Inverse 5-3 wavelet transform in 2-D. @@ -87,9 +89,11 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient); /** Forward 9-7 wavelet transform in 2-D. Apply an irreversible DWT transform to a component of an image. +@param p_tcd TCD handle @param tilec Tile component information (current tile) */ -OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec); +OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, + opj_tcd_tilecomp_t * tilec); /** Inverse 9-7 wavelet transform in 2-D. Apply an irreversible inverse DWT transform to a component of an image. diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index 108462ca..d5d60aaf 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -2488,11 +2488,11 @@ static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd) for (compno = 0; compno < l_tile->numcomps; ++compno) { if (l_tccp->qmfbid == 1) { - if (! opj_dwt_encode(l_tile_comp)) { + if (! opj_dwt_encode(p_tcd, l_tile_comp)) { return OPJ_FALSE; } } else if (l_tccp->qmfbid == 0) { - if (! opj_dwt_encode_real(l_tile_comp)) { + if (! 
opj_dwt_encode_real(p_tcd, l_tile_comp)) { return OPJ_FALSE; } } diff --git a/tools/ctest_scripts/travis-ci.cmake b/tools/ctest_scripts/travis-ci.cmake index 75ed6f6b..0d54773c 100644 --- a/tools/ctest_scripts/travis-ci.cmake +++ b/tools/ctest_scripts/travis-ci.cmake @@ -121,6 +121,9 @@ BUILD_TESTING:BOOL=${BUILD_TESTING} # Build Thirdparty, useful but not required for test suite BUILD_THIRDPARTY:BOOL=TRUE +# Build unit tests that test subcomponents of libopenjp2 (e.g. DWT) +BUILD_UNIT_TESTS:BOOL=TRUE + # JPEG2000 test files are available with git clone https://github.com/uclouvain/openjpeg-data.git OPJ_DATA_ROOT:PATH=$ENV{PWD}/data From 99107d5e468beef3cf2f7db24633ba8b40732405 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 18 May 2020 18:09:10 +0200 Subject: [PATCH 03/24] dwt.c: change sign of constants to match standard and compensate (no functional change) --- src/lib/openjp2/dwt.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index ff771817..4b00c83a 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -102,10 +102,11 @@ typedef struct v4dwt_local { OPJ_UINT32 win_h_x1; /* end coord in high pass band */ } opj_v4dwt_t ; -static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */ -static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */ -static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */ -static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */ +/* From table F.4 from the standard */ +static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; /* 12994 */ +static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f; /* 434 */ +static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f; /* -7233 */ +static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f; /* -3633 */ static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */ static const OPJ_FLOAT32 opj_c13318 = 1.625732422f; @@ -2627,19 +2628,19 @@ static void 
opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_delta)); + _mm_set1_ps(-opj_dwt_delta)); opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_gamma)); + _mm_set1_ps(-opj_dwt_gamma)); opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - _mm_set1_ps(opj_dwt_beta)); + _mm_set1_ps(-opj_dwt_beta)); opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - _mm_set1_ps(opj_dwt_alpha)); + _mm_set1_ps(-opj_dwt_alpha)); #else opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); @@ -2648,19 +2649,19 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_delta); + -opj_dwt_delta); opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - opj_dwt_gamma); + -opj_dwt_gamma); opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), - opj_dwt_beta); + -opj_dwt_beta); opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), - opj_dwt_alpha); + -opj_dwt_alpha); #endif } From 00cff6f5c02deabb64ccb15f15c13fcb773968fe Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 18 May 2020 20:15:07 +0200 Subject: [PATCH 04/24] Encoder: use floating-point operations for irreversible transformation --- 
src/lib/openjp2/dwt.c | 47 +++++++----- src/lib/openjp2/mct.c | 168 +++--------------------------------------- src/lib/openjp2/mct.h | 5 +- src/lib/openjp2/t1.c | 11 +-- src/lib/openjp2/tcd.c | 10 ++- 5 files changed, 49 insertions(+), 192 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 4b00c83a..bdc91cf5 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -134,12 +134,12 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, /** Forward 5-3 wavelet transform in 1-D */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** Forward 9-7 wavelet transform in 1-D */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** Explicit calculation of the Quantization Stepsizes @@ -156,9 +156,13 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); +/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ +typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32, + OPJ_INT32); + static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); + opj_encode_one_row_fnptr_type p_function); static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, OPJ_UINT32 i); @@ -346,10 +350,11 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) /* */ /* Forward 5-3 wavelet transform in 1-D. 
*/ /* */ -static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; + OPJ_INT32* a = (OPJ_INT32*)aIn; if (!cas) { if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ @@ -1039,50 +1044,52 @@ static void opj_idwt53_v(const opj_dwt_t *dwt, /* */ /* Forward 9-7 wavelet transform in 1-D. */ /* */ -static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, +static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; + OPJ_FLOAT32* a = (OPJ_FLOAT32*)aIn; + if (!cas) { if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ for (i = 0; i < dn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); + OPJ_D(i) += opj_dwt_alpha * (OPJ_S_(i) + OPJ_S_(i + 1)); } for (i = 0; i < sn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); + OPJ_S(i) += opj_dwt_beta * (OPJ_D_(i - 1) + OPJ_D_(i)); } for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); + OPJ_D(i) += opj_dwt_gamma * (OPJ_S_(i) + OPJ_S_(i + 1)); } for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); + OPJ_S(i) += opj_dwt_delta * (OPJ_D_(i - 1) + OPJ_D_(i)); } for (i = 0; i < dn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ + OPJ_D(i) = opj_K / 2 * OPJ_D(i); } for (i = 0; i < sn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ + OPJ_S(i) = opj_c13318 / 2 * OPJ_S(i); } } } else { if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ for (i = 0; i < dn; i++) { - OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); + OPJ_S(i) += opj_dwt_alpha * (OPJ_DD_(i) + OPJ_DD_(i - 1)); } for (i = 0; i < sn; i++) { - OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); + OPJ_D(i) += opj_dwt_beta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); } for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) 
+ OPJ_DD_(i - 1), 7233); + OPJ_S(i) += opj_dwt_gamma * (OPJ_DD_(i) + OPJ_DD_(i - 1)); } for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); + OPJ_D(i) += opj_dwt_delta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); } for (i = 0; i < dn; i++) { - OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ + OPJ_S(i) = opj_K / 2 * OPJ_S(i); } for (i = 0; i < sn; i++) { - OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ + OPJ_D(i) = opj_c13318 / 2 * OPJ_D(i); } } } @@ -1112,7 +1119,7 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); + opj_encode_one_row_fnptr_type p_function; } opj_dwt_encode_h_job_t; static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) @@ -1143,7 +1150,7 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32); + opj_encode_one_row_fnptr_type p_function; } opj_dwt_encode_v_job_t; static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) @@ -1175,7 +1182,7 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) /* */ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)) + opj_encode_one_row_fnptr_type p_function) { OPJ_INT32 i; OPJ_INT32 *bj = 00; diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c index 08bc8115..9d79b50a 100644 --- a/src/lib/openjp2/mct.c +++ b/src/lib/openjp2/mct.c @@ -209,175 +209,25 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) /* */ /* Forward irreversible MCT. 
*/ /* */ -#ifdef __SSE4_1__ void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, - OPJ_SIZE_T n) -{ - OPJ_SIZE_T i; - const OPJ_SIZE_T len = n; - - const __m128i ry = _mm_set1_epi32(2449); - const __m128i gy = _mm_set1_epi32(4809); - const __m128i by = _mm_set1_epi32(934); - const __m128i ru = _mm_set1_epi32(1382); - const __m128i gu = _mm_set1_epi32(2714); - /* const __m128i bu = _mm_set1_epi32(4096); */ - /* const __m128i rv = _mm_set1_epi32(4096); */ - const __m128i gv = _mm_set1_epi32(3430); - const __m128i bv = _mm_set1_epi32(666); - const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), - _MM_SHUFFLE(1, 0, 1, 0)); - - for (i = 0; i < (len & ~3U); i += 4) { - __m128i lo, hi; - __m128i y, u, v; - __m128i r = _mm_load_si128((const __m128i *) & (c0[i])); - __m128i g = _mm_load_si128((const __m128i *) & (c1[i])); - __m128i b = _mm_load_si128((const __m128i *) & (c2[i])); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ry); - hi = _mm_mul_epi32(hi, ry); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gy); - hi = _mm_mul_epi32(hi, gy); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, by); - hi = _mm_mul_epi32(hi, by); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c0[i]), y); - - /*lo = b; - hi = _mm_shuffle_epi32(b, 
_MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_blend_epi16(lo, hi, 0xCC); - - lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, ru); - hi = _mm_mul_epi32(hi, ru); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gu); - hi = _mm_mul_epi32(hi, gu); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c1[i]), u); - - /*lo = r; - hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, mulround); - hi = _mm_mul_epi32(hi, mulround);*/ - lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); - hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); - lo = _mm_slli_epi64(lo, 12); - hi = _mm_slli_epi64(hi, 12); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_blend_epi16(lo, hi, 0xCC); - - lo = g; - hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, gv); - hi = _mm_mul_epi32(hi, gv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, 
_mm_blend_epi16(lo, hi, 0xCC)); - - lo = b; - hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); - lo = _mm_mul_epi32(lo, bv); - hi = _mm_mul_epi32(hi, bv); - lo = _mm_add_epi64(lo, mulround); - hi = _mm_add_epi64(hi, mulround); - lo = _mm_srli_epi64(lo, 13); - hi = _mm_slli_epi64(hi, 32 - 13); - v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); - _mm_store_si128((__m128i *) & (c2[i]), v); - } - for (; i < len; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); - c0[i] = y; - c1[i] = u; - c2[i] = v; - } -} -#else -void opj_mct_encode_real( - OPJ_INT32* OPJ_RESTRICT c0, - OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, + OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n) { OPJ_SIZE_T i; for (i = 0; i < n; ++i) { - OPJ_INT32 r = c0[i]; - OPJ_INT32 g = c1[i]; - OPJ_INT32 b = c2[i]; - OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, - 4809) + opj_int_fix_mul(b, 934); - OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, - 2714) + opj_int_fix_mul(b, 4096); - OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, - 3430) - opj_int_fix_mul(b, 666); + OPJ_FLOAT32 r = c0[i]; + OPJ_FLOAT32 g = c1[i]; + OPJ_FLOAT32 b = c2[i]; + OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b; + OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b; + OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b; c0[i] = y; c1[i] = u; c2[i] = v; } } -#endif /* */ /* Inverse irreversible MCT. 
*/ diff --git a/src/lib/openjp2/mct.h b/src/lib/openjp2/mct.h index 2e37ce73..3e1f5e49 100644 --- a/src/lib/openjp2/mct.h +++ b/src/lib/openjp2/mct.h @@ -85,8 +85,9 @@ Apply an irreversible multi-component transform to an image @param c2 Samples blue component @param n Number of samples for each component */ -void opj_mct_encode_real(OPJ_INT32* OPJ_RESTRICT c0, OPJ_INT32* OPJ_RESTRICT c1, - OPJ_INT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); +void opj_mct_encode_real(OPJ_FLOAT32* OPJ_RESTRICT c0, + OPJ_FLOAT32* OPJ_RESTRICT c1, + OPJ_FLOAT32* OPJ_RESTRICT c2, OPJ_SIZE_T n); /** Apply an irreversible multi-component inverse transform to an image @param c0 Samples for luminance component diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 1b9556ea..8d5feadf 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -2194,16 +2194,11 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) tileIndex += tileLineAdvance; } } else { /* if (tccp->qmfbid == 0) */ - const OPJ_INT32 bandconst = 8192 * 8192 / ((OPJ_INT32) floor( - band->stepsize * 8192)); - for (j = 0; j < cblk_h; ++j) { for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 tmp = tiledp[tileIndex]; - tiledp[tileIndex] = - opj_int_fix_mul_t1( - tmp, - bandconst); + OPJ_FLOAT32 tmp = ((OPJ_FLOAT32*)tiledp)[tileIndex]; + tiledp[tileIndex] = (OPJ_INT32)opj_lrintf((tmp / band->stepsize) * + (1 << T1_NMSEDEC_FRACBITS)); tileIndex++; } tileIndex += tileLineAdvance; diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index d5d60aaf..503dc472 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -2411,7 +2411,8 @@ static OPJ_BOOL opj_tcd_dc_level_shift_encode(opj_tcd_t *p_tcd) } } else { for (i = 0; i < l_nb_elem; ++i) { - *l_current_ptr = (*l_current_ptr - l_tccp->m_dc_level_shift) * (1 << 11); + *((OPJ_FLOAT32 *) l_current_ptr) = (OPJ_FLOAT32)(*l_current_ptr - + l_tccp->m_dc_level_shift); ++l_current_ptr; } } @@ -2469,8 +2470,11 @@ static OPJ_BOOL opj_tcd_mct_encode(opj_tcd_t 
*p_tcd) opj_free(l_data); } else if (l_tcp->tccps->qmfbid == 0) { - opj_mct_encode_real(l_tile->comps[0].data, l_tile->comps[1].data, - l_tile->comps[2].data, samples); + opj_mct_encode_real( + (OPJ_FLOAT32*)l_tile->comps[0].data, + (OPJ_FLOAT32*)l_tile->comps[1].data, + (OPJ_FLOAT32*)l_tile->comps[2].data, + samples); } else { opj_mct_encode(l_tile->comps[0].data, l_tile->comps[1].data, l_tile->comps[2].data, samples); From 3d35d0f3af46ee206a3ea147298aad3d83a7775c Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 18 May 2020 20:17:07 +0200 Subject: [PATCH 05/24] tcd.c: add comment --- src/lib/openjp2/tcd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index 503dc472..e41e7772 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -1009,6 +1009,9 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, /** avoid an if with storing function pointer */ l_gain = (*l_gain_ptr)(l_band->bandno); numbps = (OPJ_INT32)(l_image_comp->prec + l_gain); + + /* Delta_b value of Equation E-3 in "E.1 Inverse quantization + * procedure" of the standard */ l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, (OPJ_INT32)(numbps - l_step_size->expn)))) * fraction; /* Mb value of Equation E-2 in "E.1 Inverse quantization From c2b9d09c65ec5db4a94de961b0470923aec74e2e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 19 May 2020 18:03:29 +0200 Subject: [PATCH 06/24] compare_images.c: code reformatting --- tests/compare_images.c | 1482 ++++++++++++++++++++-------------------- 1 file changed, 743 insertions(+), 739 deletions(-) diff --git a/tests/compare_images.c b/tests/compare_images.c index b2ef00db..ed39a1ae 100644 --- a/tests/compare_images.c +++ b/tests/compare_images.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France + * Copyright (c) 2011-2012, Centre National d'Etudes Spatiales (CNES), France * All rights 
reserved. * * Redistribution and use in source and binary forms, with or without @@ -53,27 +53,25 @@ * Parse MSE and PEAK input values ( * separator = ":" *******************************************************************************/ -static double* parseToleranceValues( char* inArg, const int nbcomp) +static double* parseToleranceValues(char* inArg, const int nbcomp) { - double* outArgs= malloc((size_t)nbcomp * sizeof(double)); - int it_comp = 0; - const char delims[] = ":"; - char *result = strtok( inArg, delims ); + double* outArgs = malloc((size_t)nbcomp * sizeof(double)); + int it_comp = 0; + const char delims[] = ":"; + char *result = strtok(inArg, delims); - while( (result != NULL) && (it_comp < nbcomp )) - { - outArgs[it_comp] = atof(result); - result = strtok( NULL, delims ); - it_comp++; + while ((result != NULL) && (it_comp < nbcomp)) { + outArgs[it_comp] = atof(result); + result = strtok(NULL, delims); + it_comp++; } - if (it_comp != nbcomp) - { - free(outArgs); - return NULL; + if (it_comp != nbcomp) { + free(outArgs); + return NULL; } - /* else */ - return outArgs; + /* else */ + return outArgs; } /******************************************************************************* @@ -81,28 +79,40 @@ static double* parseToleranceValues( char* inArg, const int nbcomp) *******************************************************************************/ static void compare_images_help_display(void) { - fprintf(stdout,"\nList of parameters for the compare_images function \n"); - fprintf(stdout,"\n"); - fprintf(stdout," -b \t REQUIRED \t filename to the reference/baseline PGX/TIF/PNM image \n"); - fprintf(stdout," -t \t REQUIRED \t filename to the test PGX/TIF/PNM image\n"); - fprintf(stdout," -n \t REQUIRED \t number of component of the image (used to generate correct filename, not used when both input files are TIF)\n"); - fprintf(stdout," -m \t OPTIONAL \t list of MSE tolerances, separated by : (size must correspond to the number of component) of \n"); - 
fprintf(stdout," -p \t OPTIONAL \t list of PEAK tolerances, separated by : (size must correspond to the number of component) \n"); - fprintf(stdout," -s \t OPTIONAL \t 1 or 2 filename separator to take into account PGX/PNM image with different components, " - "please indicate b or t before separator to indicate respectively the separator " - "for ref/base file and for test file. \n"); - fprintf(stdout," -d \t OPTIONAL \t indicate if you want to run this function as conformance test or as non regression test\n"); - fprintf(stdout,"\n"); + fprintf(stdout, "\nList of parameters for the compare_images function \n"); + fprintf(stdout, "\n"); + fprintf(stdout, + " -b \t REQUIRED \t filename to the reference/baseline PGX/TIF/PNM image \n"); + fprintf(stdout, " -t \t REQUIRED \t filename to the test PGX/TIF/PNM image\n"); + fprintf(stdout, + " -n \t REQUIRED \t number of component of the image (used to generate correct filename, not used when both input files are TIF)\n"); + fprintf(stdout, + " -m \t OPTIONAL \t list of MSE tolerances, separated by : (size must correspond to the number of component) of \n"); + fprintf(stdout, + " -p \t OPTIONAL \t list of PEAK tolerances, separated by : (size must correspond to the number of component) \n"); + fprintf(stdout, + " -s \t OPTIONAL \t 1 or 2 filename separator to take into account PGX/PNM image with different components, " + "please indicate b or t before separator to indicate respectively the separator " + "for ref/base file and for test file. 
\n"); + fprintf(stdout, + " -d \t OPTIONAL \t indicate if you want to run this function as conformance test or as non regression test\n"); + fprintf(stdout, "\n"); } static int get_decod_format_from_string(const char *filename) { - const int dot = '.'; - char * ext = strrchr(filename, dot); - if( strcmp(ext,".pgx") == 0 ) return PGX_DFMT; - if( strcmp(ext,".tif") == 0 ) return TIF_DFMT; - if( strcmp(ext,".ppm") == 0 ) return PXM_DFMT; - return -1; + const int dot = '.'; + char * ext = strrchr(filename, dot); + if (strcmp(ext, ".pgx") == 0) { + return PGX_DFMT; + } + if (strcmp(ext, ".tif") == 0) { + return TIF_DFMT; + } + if (strcmp(ext, ".ppm") == 0) { + return PXM_DFMT; + } + return -1; } @@ -110,325 +120,333 @@ static int get_decod_format_from_string(const char *filename) * Create filenames from a filename using separator and nb components * (begin from 0) *******************************************************************************/ -static char* createMultiComponentsFilename(const char* inFilename, const int indexF, const char* separator) +static char* createMultiComponentsFilename(const char* inFilename, + const int indexF, const char* separator) { - char s[255]; - char *outFilename, *ptr; - const char token = '.'; - size_t posToken = 0; - int decod_format; + char s[255]; + char *outFilename, *ptr; + const char token = '.'; + size_t posToken = 0; + int decod_format; - /*printf("inFilename = %s\n", inFilename);*/ - if ((ptr = strrchr(inFilename, token)) != NULL) - { - posToken = strlen(inFilename) - strlen(ptr); - /*printf("Position of %c character inside inFilename = %d\n", token, posToken);*/ + /*printf("inFilename = %s\n", inFilename);*/ + if ((ptr = strrchr(inFilename, token)) != NULL) { + posToken = strlen(inFilename) - strlen(ptr); + /*printf("Position of %c character inside inFilename = %d\n", token, posToken);*/ + } else { + /*printf("Token %c not found\n", token);*/ + outFilename = (char*)malloc(1); + outFilename[0] = '\0'; + return outFilename; } - 
else - { - /*printf("Token %c not found\n", token);*/ - outFilename = (char*)malloc(1); - outFilename[0] = '\0'; + + outFilename = (char*)malloc((posToken + 7) * sizeof(char)); /*6*/ + + strncpy(outFilename, inFilename, posToken); + outFilename[posToken] = '\0'; + strcat(outFilename, separator); + sprintf(s, "%i", indexF); + strcat(outFilename, s); + + decod_format = get_decod_format_from_string(inFilename); + if (decod_format == PGX_DFMT) { + strcat(outFilename, ".pgx"); + } else if (decod_format == PXM_DFMT) { + strcat(outFilename, ".pgm"); + } + + /*printf("outfilename: %s\n", outFilename);*/ return outFilename; - } - - outFilename = (char*)malloc((posToken + 7) * sizeof(char)); /*6*/ - - strncpy(outFilename, inFilename, posToken); - outFilename[posToken] = '\0'; - strcat(outFilename, separator); - sprintf(s, "%i", indexF); - strcat(outFilename, s); - - decod_format = get_decod_format_from_string(inFilename); - if( decod_format == PGX_DFMT ) - { - strcat(outFilename, ".pgx"); - } - else if( decod_format == PXM_DFMT ) - { - strcat(outFilename, ".pgm"); - } - - /*printf("outfilename: %s\n", outFilename);*/ - return outFilename; } /******************************************************************************* * *******************************************************************************/ -static opj_image_t* readImageFromFilePPM(const char* filename, int nbFilenamePGX, const char *separator) +static opj_image_t* readImageFromFilePPM(const char* filename, + int nbFilenamePGX, const char *separator) { - int it_file; - opj_image_t* image_read = NULL; - opj_image_t* image = NULL; - opj_cparameters_t parameters; - opj_image_cmptparm_t* param_image_read; - int** data; + int it_file; + opj_image_t* image_read = NULL; + opj_image_t* image = NULL; + opj_cparameters_t parameters; + opj_image_cmptparm_t* param_image_read; + int** data; - /* If separator is empty => nb file to read is equal to one*/ - if ( strlen(separator) == 0 ) - nbFilenamePGX = 1; + /* If separator is 
empty => nb file to read is equal to one*/ + if (strlen(separator) == 0) { + nbFilenamePGX = 1; + } - /* set encoding parameters to default values */ - opj_set_default_encoder_parameters(&parameters); - parameters.decod_format = PXM_DFMT; - strcpy(parameters.infile, filename); + /* set encoding parameters to default values */ + opj_set_default_encoder_parameters(&parameters); + parameters.decod_format = PXM_DFMT; + strcpy(parameters.infile, filename); - /* Allocate memory*/ - param_image_read = malloc((size_t)nbFilenamePGX * sizeof(opj_image_cmptparm_t)); - data = malloc((size_t)nbFilenamePGX * sizeof(*data)); + /* Allocate memory*/ + param_image_read = malloc((size_t)nbFilenamePGX * sizeof(opj_image_cmptparm_t)); + data = malloc((size_t)nbFilenamePGX * sizeof(*data)); - for (it_file = 0; it_file < nbFilenamePGX; it_file++) - { - /* Create the right filename*/ - char *filenameComponentPGX; - if (strlen(separator) == 0) - { - filenameComponentPGX = malloc((strlen(filename) + 1) * sizeof(*filenameComponentPGX)); - strcpy(filenameComponentPGX, filename); - } - else - filenameComponentPGX = createMultiComponentsFilename(filename, it_file, separator); + for (it_file = 0; it_file < nbFilenamePGX; it_file++) { + /* Create the right filename*/ + char *filenameComponentPGX; + if (strlen(separator) == 0) { + filenameComponentPGX = malloc((strlen(filename) + 1) * sizeof( + *filenameComponentPGX)); + strcpy(filenameComponentPGX, filename); + } else { + filenameComponentPGX = createMultiComponentsFilename(filename, it_file, + separator); + } + + /* Read the tif file corresponding to the component */ + image_read = pnmtoimage(filenameComponentPGX, &parameters); + if (!image_read) { + int it_free_data; + fprintf(stderr, "Unable to load ppm file: %s\n", filenameComponentPGX); + + free(param_image_read); + + for (it_free_data = 0; it_free_data < it_file; it_free_data++) { + free(data[it_free_data]); + } + free(data); + + free(filenameComponentPGX); + + return NULL; + } + + /* Set the
image_read parameters*/ + param_image_read[it_file].x0 = 0; + param_image_read[it_file].y0 = 0; + param_image_read[it_file].dx = 0; + param_image_read[it_file].dy = 0; + param_image_read[it_file].h = image_read->comps->h; + param_image_read[it_file].w = image_read->comps->w; + param_image_read[it_file].bpp = image_read->comps->bpp; + param_image_read[it_file].prec = image_read->comps->prec; + param_image_read[it_file].sgnd = image_read->comps->sgnd; + + /* Copy data*/ + data[it_file] = malloc(param_image_read[it_file].h * param_image_read[it_file].w + * sizeof(int)); + memcpy(data[it_file], image_read->comps->data, + image_read->comps->h * image_read->comps->w * sizeof(int)); + + /* Free memory*/ + opj_image_destroy(image_read); + free(filenameComponentPGX); + } + + image = opj_image_create((OPJ_UINT32)nbFilenamePGX, param_image_read, + OPJ_CLRSPC_UNSPECIFIED); + for (it_file = 0; it_file < nbFilenamePGX; it_file++) { + /* Copy data into output image and free memory*/ + memcpy(image->comps[it_file].data, data[it_file], + image->comps[it_file].h * image->comps[it_file].w * sizeof(int)); + free(data[it_file]); + } + + /* Free memory*/ + free(param_image_read); + free(data); + + return image; +} + +static opj_image_t* readImageFromFileTIF(const char* filename, + int nbFilenamePGX, const char *separator) +{ + opj_image_t* image_read = NULL; + opj_cparameters_t parameters; + (void)nbFilenamePGX; + (void)separator; + + /* conformance test suite produce annoying warning/error: + * TIFFReadDirectory: Warning, /.../data/baseline/conformance/jp2_1.tif: unknown field with tag 37724 (0x935c) encountered. + * TIFFOpen: /.../data/baseline/nonregression/opj_jp2_1.tif: Cannot open. 
+ * On Win32 this open a message box by default, so remove it from the test suite: + */ +#ifdef OPJ_HAVE_LIBTIFF + TIFFSetWarningHandler(NULL); + TIFFSetErrorHandler(NULL); +#endif + + if (strlen(separator) != 0) { + return NULL; + } + + /* set encoding parameters to default values */ + opj_set_default_encoder_parameters(¶meters); + parameters.decod_format = TIF_DFMT; + strcpy(parameters.infile, filename); /* Read the tif file corresponding to the component */ - image_read = pnmtoimage(filenameComponentPGX, ¶meters); - if (!image_read) - { - int it_free_data; - fprintf(stderr, "Unable to load ppm file: %s\n", filenameComponentPGX); - - free(param_image_read); - - for (it_free_data = 0; it_free_data < it_file; it_free_data++) { - free(data[it_free_data]); - } - free(data); - - free(filenameComponentPGX); - - return NULL; - } - - /* Set the image_read parameters*/ - param_image_read[it_file].x0 = 0; - param_image_read[it_file].y0 = 0; - param_image_read[it_file].dx = 0; - param_image_read[it_file].dy = 0; - param_image_read[it_file].h = image_read->comps->h; - param_image_read[it_file].w = image_read->comps->w; - param_image_read[it_file].bpp = image_read->comps->bpp; - param_image_read[it_file].prec = image_read->comps->prec; - param_image_read[it_file].sgnd = image_read->comps->sgnd; - - /* Copy data*/ - data[it_file] = malloc(param_image_read[it_file].h * param_image_read[it_file].w * sizeof(int)); - memcpy(data[it_file], image_read->comps->data, image_read->comps->h * image_read->comps->w * sizeof(int)); - - /* Free memory*/ - opj_image_destroy(image_read); - free(filenameComponentPGX); +#ifdef OPJ_HAVE_LIBTIFF + image_read = tiftoimage(filename, ¶meters); +#endif + if (!image_read) { + fprintf(stderr, "Unable to load TIF file\n"); + return NULL; } - image = opj_image_create((OPJ_UINT32)nbFilenamePGX, param_image_read, OPJ_CLRSPC_UNSPECIFIED); - for (it_file = 0; it_file < nbFilenamePGX; it_file++) - { - /* Copy data into output image and free memory*/ - 
memcpy(image->comps[it_file].data, data[it_file], image->comps[it_file].h * image->comps[it_file].w * sizeof(int)); - free(data[it_file]); - } - - /* Free memory*/ - free(param_image_read); - free(data); - - return image; + return image_read; } -static opj_image_t* readImageFromFileTIF(const char* filename, int nbFilenamePGX, const char *separator) +static opj_image_t* readImageFromFilePGX(const char* filename, + int nbFilenamePGX, const char *separator) { - opj_image_t* image_read = NULL; - opj_cparameters_t parameters; - (void)nbFilenamePGX; - (void)separator; + int it_file; + opj_image_t* image_read = NULL; + opj_image_t* image = NULL; + opj_cparameters_t parameters; + opj_image_cmptparm_t* param_image_read; + int** data; - /* conformance test suite produce annoying warning/error: - * TIFFReadDirectory: Warning, /.../data/baseline/conformance/jp2_1.tif: unknown field with tag 37724 (0x935c) encountered. - * TIFFOpen: /.../data/baseline/nonregression/opj_jp2_1.tif: Cannot open. - * On Win32 this open a message box by default, so remove it from the test suite: - */ -#ifdef OPJ_HAVE_LIBTIFF - TIFFSetWarningHandler(NULL); - TIFFSetErrorHandler(NULL); -#endif - - if ( strlen(separator) != 0 ) return NULL; - - /* set encoding parameters to default values */ - opj_set_default_encoder_parameters(¶meters); - parameters.decod_format = TIF_DFMT; - strcpy(parameters.infile, filename); - - /* Read the tif file corresponding to the component */ -#ifdef OPJ_HAVE_LIBTIFF - image_read = tiftoimage(filename, ¶meters); -#endif - if (!image_read) - { - fprintf(stderr, "Unable to load TIF file\n"); - return NULL; + /* If separator is empty => nb file to read is equal to one*/ + if (strlen(separator) == 0) { + nbFilenamePGX = 1; } - return image_read; -} + /* set encoding parameters to default values */ + opj_set_default_encoder_parameters(¶meters); + parameters.decod_format = PGX_DFMT; + strcpy(parameters.infile, filename); -static opj_image_t* readImageFromFilePGX(const char* 
filename, int nbFilenamePGX, const char *separator) -{ - int it_file; - opj_image_t* image_read = NULL; - opj_image_t* image = NULL; - opj_cparameters_t parameters; - opj_image_cmptparm_t* param_image_read; - int** data; + /* Allocate memory*/ + param_image_read = malloc((size_t)nbFilenamePGX * sizeof(opj_image_cmptparm_t)); + data = malloc((size_t)nbFilenamePGX * sizeof(*data)); - /* If separator is empty => nb file to read is equal to one*/ - if ( strlen(separator) == 0 ) - nbFilenamePGX = 1; + for (it_file = 0; it_file < nbFilenamePGX; it_file++) { + /* Create the right filename*/ + char *filenameComponentPGX; + if (strlen(separator) == 0) { + filenameComponentPGX = malloc((strlen(filename) + 1) * sizeof( + *filenameComponentPGX)); + strcpy(filenameComponentPGX, filename); + } else { + filenameComponentPGX = createMultiComponentsFilename(filename, it_file, + separator); + } - /* set encoding parameters to default values */ - opj_set_default_encoder_parameters(¶meters); - parameters.decod_format = PGX_DFMT; - strcpy(parameters.infile, filename); + /* Read the pgx file corresponding to the component */ + image_read = pgxtoimage(filenameComponentPGX, ¶meters); + if (!image_read) { + int it_free_data; + fprintf(stderr, "Unable to load pgx file\n"); - /* Allocate memory*/ - param_image_read = malloc((size_t)nbFilenamePGX * sizeof(opj_image_cmptparm_t)); - data = malloc((size_t)nbFilenamePGX * sizeof(*data)); + free(param_image_read); - for (it_file = 0; it_file < nbFilenamePGX; it_file++) - { - /* Create the right filename*/ - char *filenameComponentPGX; - if (strlen(separator) == 0) - { - filenameComponentPGX = malloc((strlen(filename) + 1) * sizeof(*filenameComponentPGX)); - strcpy(filenameComponentPGX, filename); - } - else - filenameComponentPGX = createMultiComponentsFilename(filename, it_file, separator); + for (it_free_data = 0; it_free_data < it_file; it_free_data++) { + free(data[it_free_data]); + } + free(data); - /* Read the pgx file corresponding to the 
component */ - image_read = pgxtoimage(filenameComponentPGX, ¶meters); - if (!image_read) - { - int it_free_data; - fprintf(stderr, "Unable to load pgx file\n"); + free(filenameComponentPGX); - free(param_image_read); + return NULL; + } - for (it_free_data = 0; it_free_data < it_file; it_free_data++) { - free(data[it_free_data]); - } - free(data); + /* Set the image_read parameters*/ + param_image_read[it_file].x0 = 0; + param_image_read[it_file].y0 = 0; + param_image_read[it_file].dx = 0; + param_image_read[it_file].dy = 0; + param_image_read[it_file].h = image_read->comps->h; + param_image_read[it_file].w = image_read->comps->w; + param_image_read[it_file].bpp = image_read->comps->bpp; + param_image_read[it_file].prec = image_read->comps->prec; + param_image_read[it_file].sgnd = image_read->comps->sgnd; - free(filenameComponentPGX); + /* Copy data*/ + data[it_file] = malloc(param_image_read[it_file].h * param_image_read[it_file].w + * sizeof(int)); + memcpy(data[it_file], image_read->comps->data, + image_read->comps->h * image_read->comps->w * sizeof(int)); - return NULL; - } + /* Free memory*/ + opj_image_destroy(image_read); + free(filenameComponentPGX); + } - /* Set the image_read parameters*/ - param_image_read[it_file].x0 = 0; - param_image_read[it_file].y0 = 0; - param_image_read[it_file].dx = 0; - param_image_read[it_file].dy = 0; - param_image_read[it_file].h = image_read->comps->h; - param_image_read[it_file].w = image_read->comps->w; - param_image_read[it_file].bpp = image_read->comps->bpp; - param_image_read[it_file].prec = image_read->comps->prec; - param_image_read[it_file].sgnd = image_read->comps->sgnd; - - /* Copy data*/ - data[it_file] = malloc(param_image_read[it_file].h * param_image_read[it_file].w * sizeof(int)); - memcpy(data[it_file], image_read->comps->data, image_read->comps->h * image_read->comps->w * sizeof(int)); + image = opj_image_create((OPJ_UINT32)nbFilenamePGX, param_image_read, + OPJ_CLRSPC_UNSPECIFIED); + for (it_file = 0; 
it_file < nbFilenamePGX; it_file++) { + /* Copy data into output image and free memory*/ + memcpy(image->comps[it_file].data, data[it_file], + image->comps[it_file].h * image->comps[it_file].w * sizeof(int)); + free(data[it_file]); + } /* Free memory*/ - opj_image_destroy(image_read); - free(filenameComponentPGX); - } + free(param_image_read); + free(data); - image = opj_image_create((OPJ_UINT32)nbFilenamePGX, param_image_read, OPJ_CLRSPC_UNSPECIFIED); - for (it_file = 0; it_file < nbFilenamePGX; it_file++) - { - /* Copy data into output image and free memory*/ - memcpy(image->comps[it_file].data, data[it_file], image->comps[it_file].h * image->comps[it_file].w * sizeof(int)); - free(data[it_file]); - } - - /* Free memory*/ - free(param_image_read); - free(data); - - return image; + return image; } #if defined(OPJ_HAVE_LIBPNG) && 0 /* remove for now */ /******************************************************************************* * *******************************************************************************/ -static int imageToPNG(const opj_image_t* image, const char* filename, int num_comp_select) +static int imageToPNG(const opj_image_t* image, const char* filename, + int num_comp_select) { - opj_image_cmptparm_t param_image_write; - opj_image_t* image_write = NULL; + opj_image_cmptparm_t param_image_write; + opj_image_t* image_write = NULL; - param_image_write.x0 = 0; - param_image_write.y0 = 0; - param_image_write.dx = 0; - param_image_write.dy = 0; - param_image_write.h = image->comps[num_comp_select].h; - param_image_write.w = image->comps[num_comp_select].w; - param_image_write.bpp = image->comps[num_comp_select].bpp; - param_image_write.prec = image->comps[num_comp_select].prec; - param_image_write.sgnd = image->comps[num_comp_select].sgnd; + param_image_write.x0 = 0; + param_image_write.y0 = 0; + param_image_write.dx = 0; + param_image_write.dy = 0; + param_image_write.h = image->comps[num_comp_select].h; + param_image_write.w = 
image->comps[num_comp_select].w; + param_image_write.bpp = image->comps[num_comp_select].bpp; + param_image_write.prec = image->comps[num_comp_select].prec; + param_image_write.sgnd = image->comps[num_comp_select].sgnd; - image_write = opj_image_create(1u, ¶m_image_write, OPJ_CLRSPC_GRAY); - memcpy(image_write->comps->data, image->comps[num_comp_select].data, param_image_write.h * param_image_write.w * sizeof(int)); + image_write = opj_image_create(1u, ¶m_image_write, OPJ_CLRSPC_GRAY); + memcpy(image_write->comps->data, image->comps[num_comp_select].data, + param_image_write.h * param_image_write.w * sizeof(int)); - imagetopng(image_write, filename); + imagetopng(image_write, filename); - opj_image_destroy(image_write); + opj_image_destroy(image_write); - return EXIT_SUCCESS; + return EXIT_SUCCESS; } #endif -typedef struct test_cmp_parameters -{ - /** */ - char* base_filename; - /** */ - char* test_filename; - /** Number of components */ - int nbcomp; - /** */ - double* tabMSEvalues; - /** */ - double* tabPEAKvalues; - /** */ - int nr_flag; - /** */ - char separator_base[2]; - /** */ - char separator_test[2]; +typedef struct test_cmp_parameters { + /** */ + char* base_filename; + /** */ + char* test_filename; + /** Number of components */ + int nbcomp; + /** */ + double* tabMSEvalues; + /** */ + double* tabPEAKvalues; + /** */ + int nr_flag; + /** */ + char separator_base[2]; + /** */ + char separator_test[2]; } test_cmp_parameters; /* return decode format PGX / TIF / PPM , return -1 on error */ static int get_decod_format(test_cmp_parameters* param) { - int base_format = get_decod_format_from_string( param->base_filename ); - int test_format = get_decod_format_from_string( param->test_filename ); - if( base_format != test_format ) return -1; - /* handle case -1: */ - return base_format; + int base_format = get_decod_format_from_string(param->base_filename); + int test_format = get_decod_format_from_string(param->test_filename); + if (base_format != test_format) { 
+ return -1; + } + /* handle case -1: */ + return base_format; } /******************************************************************************* @@ -436,206 +454,183 @@ static int get_decod_format(test_cmp_parameters* param) *******************************************************************************/ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) { - char *MSElistvalues = NULL; char *PEAKlistvalues= NULL; - char *separatorList = NULL; - size_t sizemembasefile, sizememtestfile; - int index, flagM=0, flagP=0; - const char optlist[] = "b:t:n:m:p:s:d"; - int c; + char *MSElistvalues = NULL; + char *PEAKlistvalues = NULL; + char *separatorList = NULL; + size_t sizemembasefile, sizememtestfile; + int index, flagM = 0, flagP = 0; + const char optlist[] = "b:t:n:m:p:s:d"; + int c; - /* Init parameters*/ - param->base_filename = NULL; - param->test_filename = NULL; - param->nbcomp = 0; - param->tabMSEvalues = NULL; - param->tabPEAKvalues = NULL; - param->nr_flag = 0; - param->separator_base[0] = 0; - param->separator_test[0] = 0; + /* Init parameters*/ + param->base_filename = NULL; + param->test_filename = NULL; + param->nbcomp = 0; + param->tabMSEvalues = NULL; + param->tabPEAKvalues = NULL; + param->nr_flag = 0; + param->separator_base[0] = 0; + param->separator_test[0] = 0; - opj_opterr = 0; + opj_opterr = 0; - while ((c = opj_getopt(argc, argv, optlist)) != -1) - switch (c) - { - case 'b': - sizemembasefile = strlen(opj_optarg) + 1; - param->base_filename = (char*) malloc(sizemembasefile); - strcpy(param->base_filename, opj_optarg); - /*printf("param->base_filename = %s [%d / %d]\n", param->base_filename, strlen(param->base_filename), sizemembasefile );*/ - break; - case 't': - sizememtestfile = strlen(opj_optarg) + 1; - param->test_filename = (char*) malloc(sizememtestfile); - strcpy(param->test_filename, opj_optarg); - /*printf("param->test_filename = %s [%d / %d]\n", param->test_filename, strlen(param->test_filename), 
sizememtestfile);*/ - break; - case 'n': - param->nbcomp = atoi(opj_optarg); - break; - case 'm': - MSElistvalues = opj_optarg; - flagM = 1; - break; - case 'p': - PEAKlistvalues = opj_optarg; - flagP = 1; - break; - case 'd': - param->nr_flag = 1; - break; - case 's': - separatorList = opj_optarg; - break; - case '?': - if ((opj_optopt == 'b') || (opj_optopt == 't') || (opj_optopt == 'n') || (opj_optopt == 'p') || (opj_optopt == 'm') || (opj_optopt - == 's')) - fprintf(stderr, "Option -%c requires an argument.\n", opj_optopt); - else - if (isprint(opj_optopt)) fprintf(stderr, "Unknown option `-%c'.\n", opj_optopt); - else fprintf(stderr, "Unknown option character `\\x%x'.\n", opj_optopt); + while ((c = opj_getopt(argc, argv, optlist)) != -1) + switch (c) { + case 'b': + sizemembasefile = strlen(opj_optarg) + 1; + param->base_filename = (char*) malloc(sizemembasefile); + strcpy(param->base_filename, opj_optarg); + /*printf("param->base_filename = %s [%d / %d]\n", param->base_filename, strlen(param->base_filename), sizemembasefile );*/ + break; + case 't': + sizememtestfile = strlen(opj_optarg) + 1; + param->test_filename = (char*) malloc(sizememtestfile); + strcpy(param->test_filename, opj_optarg); + /*printf("param->test_filename = %s [%d / %d]\n", param->test_filename, strlen(param->test_filename), sizememtestfile);*/ + break; + case 'n': + param->nbcomp = atoi(opj_optarg); + break; + case 'm': + MSElistvalues = opj_optarg; + flagM = 1; + break; + case 'p': + PEAKlistvalues = opj_optarg; + flagP = 1; + break; + case 'd': + param->nr_flag = 1; + break; + case 's': + separatorList = opj_optarg; + break; + case '?': + if ((opj_optopt == 'b') || (opj_optopt == 't') || (opj_optopt == 'n') || + (opj_optopt == 'p') || (opj_optopt == 'm') || (opj_optopt + == 's')) { + fprintf(stderr, "Option -%c requires an argument.\n", opj_optopt); + } else if (isprint(opj_optopt)) { + fprintf(stderr, "Unknown option `-%c'.\n", opj_optopt); + } else { + fprintf(stderr, "Unknown option 
character `\\x%x'.\n", opj_optopt); + } + return 1; + default: + fprintf(stderr, "WARNING -> this option is not valid \"-%c %s\"\n", c, + opj_optarg); + break; + } + + if (opj_optind != argc) { + for (index = opj_optind; index < argc; index++) { + fprintf(stderr, "Non-option argument %s\n", argv[index]); + } return 1; - default: - fprintf(stderr, "WARNING -> this option is not valid \"-%c %s\"\n", c, opj_optarg); - break; - } - - if (opj_optind != argc) - { - for (index = opj_optind; index < argc; index++) - fprintf(stderr,"Non-option argument %s\n", argv[index]); - return 1; } - if (param->nbcomp == 0) - { - fprintf(stderr,"Need to indicate the number of components !\n"); - return 1; + if (param->nbcomp == 0) { + fprintf(stderr, "Need to indicate the number of components !\n"); + return 1; } - /* else */ - if ( flagM && flagP ) - { - param->tabMSEvalues = parseToleranceValues( MSElistvalues, param->nbcomp); - param->tabPEAKvalues = parseToleranceValues( PEAKlistvalues, param->nbcomp); - if ( (param->tabMSEvalues == NULL) || (param->tabPEAKvalues == NULL)) - { - fprintf(stderr,"MSE and PEAK values are not correct (respectively need %d values)\n",param->nbcomp); - return 1; - } - } - - /* Get separators after corresponding letter (b or t)*/ - if (separatorList != NULL) - { - if( (strlen(separatorList) ==2) || (strlen(separatorList) ==4) ) - { - /* keep original string*/ - size_t sizeseplist = strlen(separatorList)+1; - char* separatorList2 = (char*)malloc( sizeseplist ); - strcpy(separatorList2, separatorList); - /*printf("separatorList2 = %s [%d / %d]\n", separatorList2, strlen(separatorList2), sizeseplist);*/ - - if (strlen(separatorList) == 2) /* one separator behind b or t*/ - { - char *resultT = NULL; - resultT = strtok(separatorList2, "t"); - if (strlen(resultT) == strlen(separatorList)) /* didn't find t character, try to find b*/ - { - char *resultB = NULL; - resultB = strtok(resultT, "b"); - if (strlen(resultB) == 1) - { - param->separator_base[0] = 
separatorList[1]; - param->separator_base[1] = 0; - param->separator_test[0] = 0; - } - else /* not found b*/ - { - free(separatorList2); + /* else */ + if (flagM && flagP) { + param->tabMSEvalues = parseToleranceValues(MSElistvalues, param->nbcomp); + param->tabPEAKvalues = parseToleranceValues(PEAKlistvalues, param->nbcomp); + if ((param->tabMSEvalues == NULL) || (param->tabPEAKvalues == NULL)) { + fprintf(stderr, + "MSE and PEAK values are not correct (respectively need %d values)\n", + param->nbcomp); return 1; - } - } - else /* found t*/ - { - param->separator_base[0] = 0; - param->separator_test[0] = separatorList[1]; - param->separator_test[1] = 0; - } - /*printf("sep b = %s [%d] and sep t = %s [%d]\n",param->separator_base, strlen(param->separator_base), param->separator_test, strlen(param->separator_test) );*/ } - else /* == 4 characters we must found t and b*/ - { - char *resultT = NULL; - resultT = strtok(separatorList2, "t"); - if (strlen(resultT) == 3) /* found t in first place*/ - { - char *resultB = NULL; - resultB = strtok(resultT, "b"); - if (strlen(resultB) == 1) /* found b after t*/ - { - param->separator_test[0] = separatorList[1]; - param->separator_test[1] = 0; - param->separator_base[0] = separatorList[3]; - param->separator_base[1] = 0; + } + + /* Get separators after corresponding letter (b or t)*/ + if (separatorList != NULL) { + if ((strlen(separatorList) == 2) || (strlen(separatorList) == 4)) { + /* keep original string*/ + size_t sizeseplist = strlen(separatorList) + 1; + char* separatorList2 = (char*)malloc(sizeseplist); + strcpy(separatorList2, separatorList); + /*printf("separatorList2 = %s [%d / %d]\n", separatorList2, strlen(separatorList2), sizeseplist);*/ + + if (strlen(separatorList) == 2) { /* one separator behind b or t*/ + char *resultT = NULL; + resultT = strtok(separatorList2, "t"); + if (strlen(resultT) == strlen( + separatorList)) { /* didn't find t character, try to find b*/ + char *resultB = NULL; + resultB = 
strtok(resultT, "b"); + if (strlen(resultB) == 1) { + param->separator_base[0] = separatorList[1]; + param->separator_base[1] = 0; + param->separator_test[0] = 0; + } else { /* not found b*/ + free(separatorList2); + return 1; + } + } else { /* found t*/ + param->separator_base[0] = 0; + param->separator_test[0] = separatorList[1]; + param->separator_test[1] = 0; + } + /*printf("sep b = %s [%d] and sep t = %s [%d]\n",param->separator_base, strlen(param->separator_base), param->separator_test, strlen(param->separator_test) );*/ + } else { /* == 4 characters we must found t and b*/ + char *resultT = NULL; + resultT = strtok(separatorList2, "t"); + if (strlen(resultT) == 3) { /* found t in first place*/ + char *resultB = NULL; + resultB = strtok(resultT, "b"); + if (strlen(resultB) == 1) { /* found b after t*/ + param->separator_test[0] = separatorList[1]; + param->separator_test[1] = 0; + param->separator_base[0] = separatorList[3]; + param->separator_base[1] = 0; + } else { /* didn't find b after t*/ + free(separatorList2); + return 1; + } + } else { /* == 2, didn't find t in first place*/ + char *resultB = NULL; + resultB = strtok(resultT, "b"); + if (strlen(resultB) == 1) { /* found b in first place*/ + param->separator_base[0] = separatorList[1]; + param->separator_base[1] = 0; + param->separator_test[0] = separatorList[3]; + param->separator_test[1] = 0; + } else { /* didn't found b in first place => problem*/ + free(separatorList2); + return 1; + } + } } - else /* didn't find b after t*/ - { free(separatorList2); + } else { /* wrong number of argument after -s*/ + return 1; + } + } else { + if (param->nbcomp == 1) { + assert(param->separator_base[0] == 0); + assert(param->separator_test[0] == 0); + } else { + fprintf(stderr, "If number of component is > 1, we need separator\n"); return 1; - } - } - else /* == 2, didn't find t in first place*/ - { - char *resultB = NULL; - resultB = strtok(resultT, "b"); - if (strlen(resultB) == 1) /* found b in first place*/ - 
{ - param->separator_base[0] = separatorList[1]; - param->separator_base[1] = 0; - param->separator_test[0] = separatorList[3]; - param->separator_test[1] = 0; - } - else /* didn't found b in first place => problem*/ - { - free(separatorList2); - return 1; - } - } } - free(separatorList2); - } - else /* wrong number of argument after -s*/ - { - return 1; - } - } - else - { - if (param->nbcomp == 1) - { - assert( param->separator_base[0] == 0 ); - assert( param->separator_test[0] == 0 ); - } - else - { - fprintf(stderr,"If number of component is > 1, we need separator\n"); - return 1; - } } - if ( (param->nr_flag) && (flagP || flagM) ) - { - fprintf(stderr,"Wrong input parameters list: it is non-regression test or tolerance comparison\n"); - return 1; + if ((param->nr_flag) && (flagP || flagM)) { + fprintf(stderr, + "Wrong input parameters list: it is non-regression test or tolerance comparison\n"); + return 1; } - if ( (!param->nr_flag) && (!flagP || !flagM) ) - { - fprintf(stderr,"Wrong input parameters list: it is non-regression test or tolerance comparison\n"); - return 1; + if ((!param->nr_flag) && (!flagP || !flagM)) { + fprintf(stderr, + "Wrong input parameters list: it is non-regression test or tolerance comparison\n"); + return 1; } - return 0; + return 0; } /******************************************************************************* @@ -643,310 +638,319 @@ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) *******************************************************************************/ int main(int argc, char **argv) { - test_cmp_parameters inParam; - OPJ_UINT32 it_comp, itpxl; - int failed = 1; - int nbFilenamePGXbase = 0, nbFilenamePGXtest = 0; - char *filenamePNGtest= NULL, *filenamePNGbase = NULL, *filenamePNGdiff = NULL; - size_t memsizebasefilename, memsizetestfilename; - size_t memsizedifffilename; - int valueDiff = 0, nbPixelDiff = 0; - double sumDiff = 0.0; - /* Structures to store image parameters and data*/ - 
opj_image_t *imageBase = NULL, *imageTest = NULL, *imageDiff = NULL; - opj_image_cmptparm_t* param_image_diff = NULL; - int decod_format; + test_cmp_parameters inParam; + OPJ_UINT32 it_comp, itpxl; + int failed = 1; + int nbFilenamePGXbase = 0, nbFilenamePGXtest = 0; + char *filenamePNGtest = NULL, *filenamePNGbase = NULL, *filenamePNGdiff = NULL; + size_t memsizebasefilename, memsizetestfilename; + size_t memsizedifffilename; + int valueDiff = 0, nbPixelDiff = 0; + double sumDiff = 0.0; + /* Structures to store image parameters and data*/ + opj_image_t *imageBase = NULL, *imageTest = NULL, *imageDiff = NULL; + opj_image_cmptparm_t* param_image_diff = NULL; + int decod_format; - /* Get parameters from command line*/ - if( parse_cmdline_cmp(argc, argv, &inParam) ) - { - compare_images_help_display(); - goto cleanup; + /* Get parameters from command line*/ + if (parse_cmdline_cmp(argc, argv, &inParam)) { + compare_images_help_display(); + goto cleanup; } - /* Display Parameters*/ - printf("******Parameters********* \n"); - printf(" base_filename = %s\n" - " test_filename = %s\n" - " nb of Components = %d\n" - " Non regression test = %d\n" - " separator Base = %s\n" - " separator Test = %s\n", - inParam.base_filename, inParam.test_filename, inParam.nbcomp, - inParam.nr_flag, inParam.separator_base, inParam.separator_test); + /* Display Parameters*/ + printf("******Parameters********* \n"); + printf(" base_filename = %s\n" + " test_filename = %s\n" + " nb of Components = %d\n" + " Non regression test = %d\n" + " separator Base = %s\n" + " separator Test = %s\n", + inParam.base_filename, inParam.test_filename, inParam.nbcomp, + inParam.nr_flag, inParam.separator_base, inParam.separator_test); - if ( (inParam.tabMSEvalues != NULL) && (inParam.tabPEAKvalues != NULL)) - { - int it_comp2; - printf(" MSE values = ["); - for (it_comp2 = 0; it_comp2 < inParam.nbcomp; it_comp2++) - printf(" %f ", inParam.tabMSEvalues[it_comp2]); - printf("]\n"); - printf(" PEAK values = ["); - 
for (it_comp2 = 0; it_comp2 < inParam.nbcomp; it_comp2++) - printf(" %f ", inParam.tabPEAKvalues[it_comp2]); - printf("]\n"); - printf(" Non-regression test = %d\n", inParam.nr_flag); + if ((inParam.tabMSEvalues != NULL) && (inParam.tabPEAKvalues != NULL)) { + int it_comp2; + printf(" MSE values = ["); + for (it_comp2 = 0; it_comp2 < inParam.nbcomp; it_comp2++) { + printf(" %f ", inParam.tabMSEvalues[it_comp2]); + } + printf("]\n"); + printf(" PEAK values = ["); + for (it_comp2 = 0; it_comp2 < inParam.nbcomp; it_comp2++) { + printf(" %f ", inParam.tabPEAKvalues[it_comp2]); + } + printf("]\n"); + printf(" Non-regression test = %d\n", inParam.nr_flag); } - if (strlen(inParam.separator_base) != 0) - nbFilenamePGXbase = inParam.nbcomp; - - if (strlen(inParam.separator_test) != 0) - nbFilenamePGXtest = inParam.nbcomp; - - printf(" NbFilename to generate from base filename = %d\n", nbFilenamePGXbase); - printf(" NbFilename to generate from test filename = %d\n", nbFilenamePGXtest); - printf("************************* \n"); - - /*----------BASELINE IMAGE--------*/ - memsizebasefilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; - memsizetestfilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; - - decod_format = get_decod_format(&inParam); - if( decod_format == -1 ) - { - fprintf( stderr, "Unhandled file format\n" ); - goto cleanup; - } - assert( decod_format == PGX_DFMT || decod_format == TIF_DFMT || decod_format == PXM_DFMT ); - - if( decod_format == PGX_DFMT ) - { - imageBase = readImageFromFilePGX( inParam.base_filename, nbFilenamePGXbase, inParam.separator_base); - if ( imageBase == NULL ) - goto cleanup; - } - else if( decod_format == TIF_DFMT ) - { - imageBase = readImageFromFileTIF( inParam.base_filename, nbFilenamePGXbase, ""); - if ( imageBase == NULL ) - goto cleanup; - } - else if( decod_format == PXM_DFMT ) - { - imageBase = readImageFromFilePPM( inParam.base_filename, nbFilenamePGXbase, inParam.separator_base); - if ( imageBase == NULL ) - goto 
cleanup; + if (strlen(inParam.separator_base) != 0) { + nbFilenamePGXbase = inParam.nbcomp; } - filenamePNGbase = (char*) malloc(memsizebasefilename); - strcpy(filenamePNGbase, inParam.test_filename); - strcat(filenamePNGbase, ".base"); - /*printf("filenamePNGbase = %s [%d / %d octets]\n",filenamePNGbase, strlen(filenamePNGbase),memsizebasefilename );*/ - - /*----------TEST IMAGE--------*/ - - if( decod_format == PGX_DFMT ) - { - imageTest = readImageFromFilePGX(inParam.test_filename, nbFilenamePGXtest, inParam.separator_test); - if ( imageTest == NULL ) - goto cleanup; - } - else if( decod_format == TIF_DFMT ) - { - imageTest = readImageFromFileTIF(inParam.test_filename, nbFilenamePGXtest, ""); - if ( imageTest == NULL ) - goto cleanup; - } - else if( decod_format == PXM_DFMT ) - { - imageTest = readImageFromFilePPM(inParam.test_filename, nbFilenamePGXtest, inParam.separator_test); - if ( imageTest == NULL ) - goto cleanup; + if (strlen(inParam.separator_test) != 0) { + nbFilenamePGXtest = inParam.nbcomp; } - filenamePNGtest = (char*) malloc(memsizetestfilename); - strcpy(filenamePNGtest, inParam.test_filename); - strcat(filenamePNGtest, ".test"); - /*printf("filenamePNGtest = %s [%d / %d octets]\n",filenamePNGtest, strlen(filenamePNGtest),memsizetestfilename );*/ + printf(" NbFilename to generate from base filename = %d\n", nbFilenamePGXbase); + printf(" NbFilename to generate from test filename = %d\n", nbFilenamePGXtest); + printf("************************* \n"); - /*----------DIFF IMAGE--------*/ + /*----------BASELINE IMAGE--------*/ + memsizebasefilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; + memsizetestfilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; - /* Allocate memory*/ - param_image_diff = malloc( imageBase->numcomps * sizeof(opj_image_cmptparm_t)); + decod_format = get_decod_format(&inParam); + if (decod_format == -1) { + fprintf(stderr, "Unhandled file format\n"); + goto cleanup; + } + assert(decod_format == PGX_DFMT || 
decod_format == TIF_DFMT || + decod_format == PXM_DFMT); - /* Comparison of header parameters*/ - printf("Step 1 -> Header comparison\n"); - - /* check dimensions (issue 286)*/ - if(imageBase->numcomps != imageTest->numcomps ) - { - printf("ERROR: dim mismatch (%d><%d)\n", imageBase->numcomps, imageTest->numcomps); - goto cleanup; + if (decod_format == PGX_DFMT) { + imageBase = readImageFromFilePGX(inParam.base_filename, nbFilenamePGXbase, + inParam.separator_base); + if (imageBase == NULL) { + goto cleanup; + } + } else if (decod_format == TIF_DFMT) { + imageBase = readImageFromFileTIF(inParam.base_filename, nbFilenamePGXbase, ""); + if (imageBase == NULL) { + goto cleanup; + } + } else if (decod_format == PXM_DFMT) { + imageBase = readImageFromFilePPM(inParam.base_filename, nbFilenamePGXbase, + inParam.separator_base); + if (imageBase == NULL) { + goto cleanup; + } } - for (it_comp = 0; it_comp < imageBase->numcomps; it_comp++) - { - param_image_diff[it_comp].x0 = 0; - param_image_diff[it_comp].y0 = 0; - param_image_diff[it_comp].dx = 0; - param_image_diff[it_comp].dy = 0; - param_image_diff[it_comp].sgnd = 0; - param_image_diff[it_comp].prec = 8; - param_image_diff[it_comp].bpp = 1; - param_image_diff[it_comp].h = imageBase->comps[it_comp].h; - param_image_diff[it_comp].w = imageBase->comps[it_comp].w; + filenamePNGbase = (char*) malloc(memsizebasefilename); + strcpy(filenamePNGbase, inParam.test_filename); + strcat(filenamePNGbase, ".base"); + /*printf("filenamePNGbase = %s [%d / %d octets]\n",filenamePNGbase, strlen(filenamePNGbase),memsizebasefilename );*/ - if (imageBase->comps[it_comp].sgnd != imageTest->comps[it_comp].sgnd) - { - printf("ERROR: sign mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).sgnd, ((imageTest->comps)[it_comp]).sgnd); - goto cleanup; - } + /*----------TEST IMAGE--------*/ - if (((imageBase->comps)[it_comp]).prec != ((imageTest->comps)[it_comp]).prec) - { - printf("ERROR: prec mismatch [comp %d] (%d><%d)\n", 
it_comp, ((imageBase->comps)[it_comp]).prec, ((imageTest->comps)[it_comp]).prec); - goto cleanup; - } - - if (((imageBase->comps)[it_comp]).bpp != ((imageTest->comps)[it_comp]).bpp) - { - printf("ERROR: byte per pixel mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).bpp, ((imageTest->comps)[it_comp]).bpp); - goto cleanup; - } - - if (((imageBase->comps)[it_comp]).h != ((imageTest->comps)[it_comp]).h) - { - printf("ERROR: height mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).h, ((imageTest->comps)[it_comp]).h); - goto cleanup; - } - - if (((imageBase->comps)[it_comp]).w != ((imageTest->comps)[it_comp]).w) - { - printf("ERROR: width mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).w, ((imageTest->comps)[it_comp]).w); - goto cleanup; - } + if (decod_format == PGX_DFMT) { + imageTest = readImageFromFilePGX(inParam.test_filename, nbFilenamePGXtest, + inParam.separator_test); + if (imageTest == NULL) { + goto cleanup; + } + } else if (decod_format == TIF_DFMT) { + imageTest = readImageFromFileTIF(inParam.test_filename, nbFilenamePGXtest, ""); + if (imageTest == NULL) { + goto cleanup; + } + } else if (decod_format == PXM_DFMT) { + imageTest = readImageFromFilePPM(inParam.test_filename, nbFilenamePGXtest, + inParam.separator_test); + if (imageTest == NULL) { + goto cleanup; + } } - imageDiff = opj_image_create(imageBase->numcomps, param_image_diff, OPJ_CLRSPC_UNSPECIFIED); - /* Free memory*/ - free(param_image_diff); param_image_diff = NULL; + filenamePNGtest = (char*) malloc(memsizetestfilename); + strcpy(filenamePNGtest, inParam.test_filename); + strcat(filenamePNGtest, ".test"); + /*printf("filenamePNGtest = %s [%d / %d octets]\n",filenamePNGtest, strlen(filenamePNGtest),memsizetestfilename );*/ - /* Measurement computation*/ - printf("Step 2 -> measurement comparison\n"); + /*----------DIFF IMAGE--------*/ - memsizedifffilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; - filenamePNGdiff = 
(char*) malloc(memsizedifffilename); - strcpy(filenamePNGdiff, inParam.test_filename); - strcat(filenamePNGdiff, ".diff"); - /*printf("filenamePNGdiff = %s [%d / %d octets]\n",filenamePNGdiff, strlen(filenamePNGdiff),memsizedifffilename );*/ + /* Allocate memory*/ + param_image_diff = malloc(imageBase->numcomps * sizeof(opj_image_cmptparm_t)); - /* Compute pixel diff*/ - for (it_comp = 0; it_comp < imageDiff->numcomps; it_comp++) - { - double SE=0,PEAK=0; - double MSE=0; - for (itpxl = 0; itpxl < ((imageDiff->comps)[it_comp]).w * ((imageDiff->comps)[it_comp]).h; itpxl++) - { - if (abs( ((imageBase->comps)[it_comp]).data[itpxl] - ((imageTest->comps)[it_comp]).data[itpxl] ) > 0) - { - valueDiff = ((imageBase->comps)[it_comp]).data[itpxl] - ((imageTest->comps)[it_comp]).data[itpxl]; - ((imageDiff->comps)[it_comp]).data[itpxl] = abs(valueDiff); - sumDiff += valueDiff; - nbPixelDiff++; + /* Comparison of header parameters*/ + printf("Step 1 -> Header comparison\n"); - SE += (double)valueDiff * valueDiff; - PEAK = (PEAK > abs(valueDiff)) ? 
PEAK : abs(valueDiff); - } - else - ((imageDiff->comps)[it_comp]).data[itpxl] = 0; - }/* h*w loop */ + /* check dimensions (issue 286)*/ + if (imageBase->numcomps != imageTest->numcomps) { + printf("ERROR: dim mismatch (%d><%d)\n", imageBase->numcomps, + imageTest->numcomps); + goto cleanup; + } - MSE = SE / ( ((imageDiff->comps)[it_comp]).w * ((imageDiff->comps)[it_comp]).h ); + for (it_comp = 0; it_comp < imageBase->numcomps; it_comp++) { + param_image_diff[it_comp].x0 = 0; + param_image_diff[it_comp].y0 = 0; + param_image_diff[it_comp].dx = 0; + param_image_diff[it_comp].dy = 0; + param_image_diff[it_comp].sgnd = 0; + param_image_diff[it_comp].prec = 8; + param_image_diff[it_comp].bpp = 1; + param_image_diff[it_comp].h = imageBase->comps[it_comp].h; + param_image_diff[it_comp].w = imageBase->comps[it_comp].w; - if (!inParam.nr_flag && (inParam.tabMSEvalues != NULL) && (inParam.tabPEAKvalues != NULL)) - { /* Conformance test*/ - printf(" %f \n", it_comp, PEAK); - printf(" %f \n", it_comp, MSE); + if (imageBase->comps[it_comp].sgnd != imageTest->comps[it_comp].sgnd) { + printf("ERROR: sign mismatch [comp %d] (%d><%d)\n", it_comp, + ((imageBase->comps)[it_comp]).sgnd, ((imageTest->comps)[it_comp]).sgnd); + goto cleanup; + } - if ( (MSE > inParam.tabMSEvalues[it_comp]) || (PEAK > inParam.tabPEAKvalues[it_comp]) ) - { - printf("ERROR: MSE (%f) or PEAK (%f) values produced by the decoded file are greater " - "than the allowable error (respectively %f and %f) \n", - MSE, PEAK, inParam.tabMSEvalues[it_comp], inParam.tabPEAKvalues[it_comp]); - goto cleanup; - } - } - else /* Non regression-test */ - { - if ( nbPixelDiff > 0) - { - char it_compc[255]; - it_compc[0] = 0; + if (((imageBase->comps)[it_comp]).prec != ((imageTest->comps)[it_comp]).prec) { + printf("ERROR: prec mismatch [comp %d] (%d><%d)\n", it_comp, + ((imageBase->comps)[it_comp]).prec, ((imageTest->comps)[it_comp]).prec); + goto cleanup; + } - printf(" %d \n", it_comp, nbPixelDiff); - printf(" %f \n", 
it_comp, sumDiff); - printf(" %f \n", it_comp, PEAK); - printf(" %f \n", it_comp, MSE); + if (((imageBase->comps)[it_comp]).bpp != ((imageTest->comps)[it_comp]).bpp) { + printf("ERROR: byte per pixel mismatch [comp %d] (%d><%d)\n", it_comp, + ((imageBase->comps)[it_comp]).bpp, ((imageTest->comps)[it_comp]).bpp); + goto cleanup; + } + + if (((imageBase->comps)[it_comp]).h != ((imageTest->comps)[it_comp]).h) { + printf("ERROR: height mismatch [comp %d] (%d><%d)\n", it_comp, + ((imageBase->comps)[it_comp]).h, ((imageTest->comps)[it_comp]).h); + goto cleanup; + } + + if (((imageBase->comps)[it_comp]).w != ((imageTest->comps)[it_comp]).w) { + printf("ERROR: width mismatch [comp %d] (%d><%d)\n", it_comp, + ((imageBase->comps)[it_comp]).w, ((imageTest->comps)[it_comp]).w); + goto cleanup; + } + } + + imageDiff = opj_image_create(imageBase->numcomps, param_image_diff, + OPJ_CLRSPC_UNSPECIFIED); + /* Free memory*/ + free(param_image_diff); + param_image_diff = NULL; + + /* Measurement computation*/ + printf("Step 2 -> measurement comparison\n"); + + memsizedifffilename = strlen(inParam.test_filename) + 1 + 5 + 2 + 4; + filenamePNGdiff = (char*) malloc(memsizedifffilename); + strcpy(filenamePNGdiff, inParam.test_filename); + strcat(filenamePNGdiff, ".diff"); + /*printf("filenamePNGdiff = %s [%d / %d octets]\n",filenamePNGdiff, strlen(filenamePNGdiff),memsizedifffilename );*/ + + /* Compute pixel diff*/ + for (it_comp = 0; it_comp < imageDiff->numcomps; it_comp++) { + double SE = 0, PEAK = 0; + double MSE = 0; + for (itpxl = 0; + itpxl < ((imageDiff->comps)[it_comp]).w * ((imageDiff->comps)[it_comp]).h; + itpxl++) { + if (abs(((imageBase->comps)[it_comp]).data[itpxl] - (( + imageTest->comps)[it_comp]).data[itpxl]) > 0) { + valueDiff = ((imageBase->comps)[it_comp]).data[itpxl] - (( + imageTest->comps)[it_comp]).data[itpxl]; + ((imageDiff->comps)[it_comp]).data[itpxl] = abs(valueDiff); + sumDiff += valueDiff; + nbPixelDiff++; + + SE += (double)valueDiff * valueDiff; + PEAK = 
(PEAK > abs(valueDiff)) ? PEAK : abs(valueDiff); + } else { + ((imageDiff->comps)[it_comp]).data[itpxl] = 0; + } + }/* h*w loop */ + + MSE = SE / (((imageDiff->comps)[it_comp]).w * ((imageDiff->comps)[it_comp]).h); + + if (!inParam.nr_flag && (inParam.tabMSEvalues != NULL) && + (inParam.tabPEAKvalues != NULL)) { + /* Conformance test*/ + printf(" %f \n", + it_comp, PEAK); + printf(" %f \n", + it_comp, MSE); + + if ((MSE > inParam.tabMSEvalues[it_comp]) || + (PEAK > inParam.tabPEAKvalues[it_comp])) { + printf("ERROR: MSE (%f) or PEAK (%f) values produced by the decoded file are greater " + "than the allowable error (respectively %f and %f) \n", + MSE, PEAK, inParam.tabMSEvalues[it_comp], inParam.tabPEAKvalues[it_comp]); + goto cleanup; + } + } else { /* Non regression-test */ + if (nbPixelDiff > 0) { + char it_compc[255]; + it_compc[0] = 0; + + printf(" %d \n", + it_comp, nbPixelDiff); + printf(" %f \n", + it_comp, sumDiff); + printf(" %f \n", + it_comp, PEAK); + printf(" %f \n", + it_comp, MSE); #ifdef OPJ_HAVE_LIBPNG - { - char *filenamePNGbase_it_comp, *filenamePNGtest_it_comp, *filenamePNGdiff_it_comp; + { + char *filenamePNGbase_it_comp, *filenamePNGtest_it_comp, + *filenamePNGdiff_it_comp; - filenamePNGbase_it_comp = (char*) malloc(memsizebasefilename); - strcpy(filenamePNGbase_it_comp,filenamePNGbase); + filenamePNGbase_it_comp = (char*) malloc(memsizebasefilename); + strcpy(filenamePNGbase_it_comp, filenamePNGbase); - filenamePNGtest_it_comp = (char*) malloc(memsizetestfilename); - strcpy(filenamePNGtest_it_comp,filenamePNGtest); + filenamePNGtest_it_comp = (char*) malloc(memsizetestfilename); + strcpy(filenamePNGtest_it_comp, filenamePNGtest); - filenamePNGdiff_it_comp = (char*) malloc(memsizedifffilename); - strcpy(filenamePNGdiff_it_comp,filenamePNGdiff); + filenamePNGdiff_it_comp = (char*) malloc(memsizedifffilename); + strcpy(filenamePNGdiff_it_comp, filenamePNGdiff); - sprintf(it_compc, "_%i", it_comp); - strcat(it_compc,".png"); - 
strcat(filenamePNGbase_it_comp, it_compc); - /*printf("filenamePNGbase_it = %s [%d / %d octets]\n",filenamePNGbase_it_comp, strlen(filenamePNGbase_it_comp),memsizebasefilename );*/ - strcat(filenamePNGtest_it_comp, it_compc); - /*printf("filenamePNGtest_it = %s [%d / %d octets]\n",filenamePNGtest_it_comp, strlen(filenamePNGtest_it_comp),memsizetestfilename );*/ - strcat(filenamePNGdiff_it_comp, it_compc); - /*printf("filenamePNGdiff_it = %s [%d / %d octets]\n",filenamePNGdiff_it_comp, strlen(filenamePNGdiff_it_comp),memsizedifffilename );*/ + sprintf(it_compc, "_%i", it_comp); + strcat(it_compc, ".png"); + strcat(filenamePNGbase_it_comp, it_compc); + /*printf("filenamePNGbase_it = %s [%d / %d octets]\n",filenamePNGbase_it_comp, strlen(filenamePNGbase_it_comp),memsizebasefilename );*/ + strcat(filenamePNGtest_it_comp, it_compc); + /*printf("filenamePNGtest_it = %s [%d / %d octets]\n",filenamePNGtest_it_comp, strlen(filenamePNGtest_it_comp),memsizetestfilename );*/ + strcat(filenamePNGdiff_it_comp, it_compc); + /*printf("filenamePNGdiff_it = %s [%d / %d octets]\n",filenamePNGdiff_it_comp, strlen(filenamePNGdiff_it_comp),memsizedifffilename );*/ - /* - if ( imageToPNG(imageBase, filenamePNGbase_it_comp, it_comp) == EXIT_SUCCESS ) - { - printf(" %s \n", it_comp, filenamePNGbase_it_comp); - } + /* + if ( imageToPNG(imageBase, filenamePNGbase_it_comp, it_comp) == EXIT_SUCCESS ) + { + printf(" %s \n", it_comp, filenamePNGbase_it_comp); + } - if ( imageToPNG(imageTest, filenamePNGtest_it_comp, it_comp) == EXIT_SUCCESS ) - { - printf(" %s \n", it_comp, filenamePNGtest_it_comp); - } + if ( imageToPNG(imageTest, filenamePNGtest_it_comp, it_comp) == EXIT_SUCCESS ) + { + printf(" %s \n", it_comp, filenamePNGtest_it_comp); + } - if ( imageToPNG(imageDiff, filenamePNGdiff_it_comp, it_comp) == EXIT_SUCCESS ) - { - printf(" %s \n", it_comp, filenamePNGdiff_it_comp); - } - */ + if ( imageToPNG(imageDiff, filenamePNGdiff_it_comp, it_comp) == EXIT_SUCCESS ) + { + printf(" %s \n", 
it_comp, filenamePNGdiff_it_comp); + } + */ - free(filenamePNGbase_it_comp); - free(filenamePNGtest_it_comp); - free(filenamePNGdiff_it_comp); - } + free(filenamePNGbase_it_comp); + free(filenamePNGtest_it_comp); + free(filenamePNGdiff_it_comp); + } #endif - goto cleanup; - } - } - } /* it_comp loop */ + goto cleanup; + } + } + } /* it_comp loop */ - printf("---- TEST SUCCEED ----\n"); - failed = 0; + printf("---- TEST SUCCEED ----\n"); + failed = 0; cleanup: - /*-----------------------------*/ - free(param_image_diff); - /* Free memory */ - opj_image_destroy(imageBase); - opj_image_destroy(imageTest); - opj_image_destroy(imageDiff); + /*-----------------------------*/ + free(param_image_diff); + /* Free memory */ + opj_image_destroy(imageBase); + opj_image_destroy(imageTest); + opj_image_destroy(imageDiff); - free(filenamePNGbase); - free(filenamePNGtest); - free(filenamePNGdiff); + free(filenamePNGbase); + free(filenamePNGtest); + free(filenamePNGdiff); - free(inParam.tabMSEvalues); - free(inParam.tabPEAKvalues); - free(inParam.base_filename); - free(inParam.test_filename); + free(inParam.tabMSEvalues); + free(inParam.tabPEAKvalues); + free(inParam.base_filename); + free(inParam.test_filename); - return failed ? EXIT_FAILURE : EXIT_SUCCESS; + return failed ? 
EXIT_FAILURE : EXIT_SUCCESS; } From fe4c15f12c562a42a6b0c4b0a0c5e42a25797235 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 19 May 2020 18:03:58 +0200 Subject: [PATCH 07/24] Testing: revise testing of lossy encoding by comparing PEAK and MSE with original image --- tests/compare_images.c | 54 ++++++++++++++----- tests/nonregression/CMakeLists.txt | 53 +++++++++++++++++- tests/nonregression/test_suite.ctest.in | 16 +++--- ...untu12.04-gcc4.6.3-x86_64-Release-3rdP.txt | 8 --- ...tu14.04-clang3.8.0-x86_64-Release-3rdP.txt | 8 --- ...Ubuntu14.04-gcc4.8.4-i386-Release-3rdP.txt | 8 --- ...Ubuntu14.04-gcc4.8.4-x86_64-Debug-3rdP.txt | 8 --- 7 files changed, 101 insertions(+), 54 deletions(-) delete mode 100644 tools/travis-ci/knownfailures-Ubuntu12.04-gcc4.6.3-x86_64-Release-3rdP.txt delete mode 100644 tools/travis-ci/knownfailures-Ubuntu14.04-clang3.8.0-x86_64-Release-3rdP.txt delete mode 100644 tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-x86_64-Debug-3rdP.txt diff --git a/tests/compare_images.c b/tests/compare_images.c index ed39a1ae..c2d95e5b 100644 --- a/tests/compare_images.c +++ b/tests/compare_images.c @@ -96,6 +96,8 @@ static void compare_images_help_display(void) "for ref/base file and for test file. \n"); fprintf(stdout, " -d \t OPTIONAL \t indicate if you want to run this function as conformance test or as non regression test\n"); + fprintf(stdout, + " -i \t OPTIONAL \t list of features to ignore. 
Currently 'prec' only supported\n"); fprintf(stdout, "\n"); } @@ -434,6 +436,8 @@ typedef struct test_cmp_parameters { char separator_base[2]; /** */ char separator_test[2]; + /** whether to ignore prec differences */ + int ignore_prec; } test_cmp_parameters; @@ -459,7 +463,8 @@ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) char *separatorList = NULL; size_t sizemembasefile, sizememtestfile; int index, flagM = 0, flagP = 0; - const char optlist[] = "b:t:n:m:p:s:d"; + const char optlist[] = "b:t:n:m:p:s:di:"; + char* ignoreList = NULL; int c; /* Init parameters*/ @@ -471,6 +476,7 @@ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) param->nr_flag = 0; param->separator_base[0] = 0; param->separator_test[0] = 0; + param->ignore_prec = 0; opj_opterr = 0; @@ -505,6 +511,9 @@ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) case 's': separatorList = opj_optarg; break; + case 'i': + ignoreList = opj_optarg; + break; case '?': if ((opj_optopt == 'b') || (opj_optopt == 't') || (opj_optopt == 'n') || (opj_optopt == 'p') || (opj_optopt == 'm') || (opj_optopt @@ -618,6 +627,14 @@ static int parse_cmdline_cmp(int argc, char **argv, test_cmp_parameters* param) } } + if (ignoreList != NULL) { + if (strcmp(ignoreList, "prec") == 0) { + param->ignore_prec = 1; + } else { + fprintf(stderr, "Unsupported value for -i\n"); + return 1; + } + } if ((param->nr_flag) && (flagP || flagM)) { fprintf(stderr, @@ -645,7 +662,7 @@ int main(int argc, char **argv) char *filenamePNGtest = NULL, *filenamePNGbase = NULL, *filenamePNGdiff = NULL; size_t memsizebasefilename, memsizetestfilename; size_t memsizedifffilename; - int valueDiff = 0, nbPixelDiff = 0; + int nbPixelDiff = 0; double sumDiff = 0.0; /* Structures to store image parameters and data*/ opj_image_t *imageBase = NULL, *imageTest = NULL, *imageDiff = NULL; @@ -790,14 +807,16 @@ int main(int argc, char **argv) goto cleanup; } - if 
(((imageBase->comps)[it_comp]).prec != ((imageTest->comps)[it_comp]).prec) { + if (((imageBase->comps)[it_comp]).prec != ((imageTest->comps)[it_comp]).prec && + !inParam.ignore_prec) { printf("ERROR: prec mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).prec, ((imageTest->comps)[it_comp]).prec); goto cleanup; } - if (((imageBase->comps)[it_comp]).bpp != ((imageTest->comps)[it_comp]).bpp) { - printf("ERROR: byte per pixel mismatch [comp %d] (%d><%d)\n", it_comp, + if (((imageBase->comps)[it_comp]).bpp != ((imageTest->comps)[it_comp]).bpp && + !inParam.ignore_prec) { + printf("ERROR: bit per pixel mismatch [comp %d] (%d><%d)\n", it_comp, ((imageBase->comps)[it_comp]).bpp, ((imageTest->comps)[it_comp]).bpp); goto cleanup; } @@ -831,16 +850,25 @@ int main(int argc, char **argv) /*printf("filenamePNGdiff = %s [%d / %d octets]\n",filenamePNGdiff, strlen(filenamePNGdiff),memsizedifffilename );*/ /* Compute pixel diff*/ + failed = 0; for (it_comp = 0; it_comp < imageDiff->numcomps; it_comp++) { double SE = 0, PEAK = 0; double MSE = 0; + unsigned right_shift_input = 0; + unsigned right_shift_output = 0; + if (((imageBase->comps)[it_comp]).bpp > ((imageTest->comps)[it_comp]).bpp) { + right_shift_input = ((imageBase->comps)[it_comp]).bpp - (( + imageTest->comps)[it_comp]).bpp; + } else { + right_shift_output = ((imageTest->comps)[it_comp]).bpp - (( + imageBase->comps)[it_comp]).bpp; + } for (itpxl = 0; itpxl < ((imageDiff->comps)[it_comp]).w * ((imageDiff->comps)[it_comp]).h; itpxl++) { - if (abs(((imageBase->comps)[it_comp]).data[itpxl] - (( - imageTest->comps)[it_comp]).data[itpxl]) > 0) { - valueDiff = ((imageBase->comps)[it_comp]).data[itpxl] - (( - imageTest->comps)[it_comp]).data[itpxl]; + int valueDiff = (((imageBase->comps)[it_comp]).data[itpxl] >> right_shift_input) + - (((imageTest->comps)[it_comp]).data[itpxl] >> right_shift_output); + if (valueDiff != 0) { ((imageDiff->comps)[it_comp]).data[itpxl] = abs(valueDiff); sumDiff += valueDiff; 
nbPixelDiff++; @@ -867,7 +895,7 @@ int main(int argc, char **argv) printf("ERROR: MSE (%f) or PEAK (%f) values produced by the decoded file are greater " "than the allowable error (respectively %f and %f) \n", MSE, PEAK, inParam.tabMSEvalues[it_comp], inParam.tabPEAKvalues[it_comp]); - goto cleanup; + failed = 1; } } else { /* Non regression-test */ if (nbPixelDiff > 0) { @@ -928,13 +956,15 @@ int main(int argc, char **argv) free(filenamePNGdiff_it_comp); } #endif + failed = 1; goto cleanup; } } } /* it_comp loop */ - printf("---- TEST SUCCEED ----\n"); - failed = 0; + if (!failed) { + printf("---- TEST SUCCEED ----\n"); + } cleanup: /*-----------------------------*/ free(param_image_diff); diff --git a/tests/nonregression/CMakeLists.txt b/tests/nonregression/CMakeLists.txt index 9f956b90..f1813ed8 100644 --- a/tests/nonregression/CMakeLists.txt +++ b/tests/nonregression/CMakeLists.txt @@ -248,6 +248,32 @@ foreach(OPJ_TEST_CMD_LINE ${OPJ_TEST_CMD_LINE_LIST}) list(REMOVE_AT CMD_ARG_LIST 0) + if(ENC_TEST_FOUND) + + # Parse lines like opj_compress lossy-check { -n 3 -m 0:0:0 -p 0:0:0 } ... 
+ set(LOSSY_CHECK_ARG_LIST "") + list(GET CMD_ARG_LIST 0 NEXT_ARG) + string(REGEX MATCH "^lossy-check$" LOSSY_CHECK ${NEXT_ARG}) + if(LOSSY_CHECK) + list(REMOVE_AT CMD_ARG_LIST 0) + list(GET CMD_ARG_LIST 0 NEXT_ARG) + string(REGEX MATCH "^{$" FOUND_OPEN_CURL ${NEXT_ARG}) + if(NOT FOUND_OPEN_CURL) + message( FATAL_ERROR "'{' expected after lossy-check") + endif() + list(REMOVE_AT CMD_ARG_LIST 0) + while(TRUE) + list(GET CMD_ARG_LIST 0 NEXT_ARG) + list(REMOVE_AT CMD_ARG_LIST 0) + string(REGEX MATCH "^}$" FOUND_CLOSE_CURL ${NEXT_ARG}) + if(FOUND_CLOSE_CURL) + break() + endif() + list (APPEND LOSSY_CHECK_ARG_LIST ${NEXT_ARG}) + endwhile() + endif() + endif() + endif () # Parse the argument list to find the input filename and output filename @@ -320,8 +346,32 @@ foreach(OPJ_TEST_CMD_LINE ${OPJ_TEST_CMD_LINE_LIST}) PROPERTIES DEPENDS NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-dump) + if(LOSSY_CHECK) + add_test(NAME NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-decode-ref + COMMAND opj_decompress + -i ${OUTPUT_FILENAME} + -o ${OUTPUT_FILENAME}.tif + ) + + set_tests_properties(NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-decode-ref + PROPERTIES DEPENDS + NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-encode) + + # Compare the decoding file with original one, using tolerance + add_test(NAME NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-compare_dec-ref-out2base + COMMAND compare_images + -b ${INPUT_FILENAME} + -t ${OUTPUT_FILENAME}.tif + -s bXtY + ${LOSSY_CHECK_ARG_LIST} + ) + + set_tests_properties(NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-compare_dec-ref-out2base + PROPERTIES DEPENDS + NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-decode-ref) + # Decode the encoding file with kakadu expand command - if (KDU_EXPAND_EXECUTABLE) + elseif (KDU_EXPAND_EXECUTABLE) add_test(NAME NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-decode-ref COMMAND ${KDU_EXPAND_EXECUTABLE} -i ${OUTPUT_FILENAME} @@ -344,7 +394,6 @@ foreach(OPJ_TEST_CMD_LINE ${OPJ_TEST_CMD_LINE_LIST}) PROPERTIES DEPENDS 
NR-ENC-${INPUT_FILENAME_NAME}-${IT_TEST_ENC}-decode-ref) endif() - endif() # Test the encoded file is a valid JP2 file diff --git a/tests/nonregression/test_suite.ctest.in b/tests/nonregression/test_suite.ctest.in index 3ed97ab1..3cf4dfc2 100644 --- a/tests/nonregression/test_suite.ctest.in +++ b/tests/nonregression/test_suite.ctest.in @@ -32,16 +32,16 @@ opj_compress -i @INPUT_NR_PATH@/random-issue-0005.tif -o @TEMP_PATH@/random-issu # related to issue 62 opj_compress -i @INPUT_NR_PATH@/tmp-issue-0062.raw -o @TEMP_PATH@/tmp-issue-0062-u.raw.j2k -F 512,512,1,16,u opj_compress -i @INPUT_NR_PATH@/tmp-issue-0062.raw -o @TEMP_PATH@/tmp-issue-0062-s.raw.j2k -F 512,512,1,16,s -opj_compress -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_24.j2k -cinema2K 24 -opj_compress -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_24.j2k -cinema2K 24 -opj_compress -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_24.j2k -cinema2K 24 -opj_compress -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_48.j2k -cinema2K 48 -opj_compress -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_48.j2k -cinema2K 48 -opj_compress -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_48.j2k -cinema2K 48 -opj_compress -i @INPUT_NR_PATH@/ElephantDream_4K.tif -o @TEMP_PATH@/ElephantDream_4K_C4K.j2k -cinema4K +opj_compress lossy-check { -n 3 -i prec -m 175:100:212 -p 78:63:91 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 298:168:363 -p 121:73:164 } -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 
76:54:140 -p 55:49:74 } -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 384:385:842 -p 134:146:200 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_48.j2k -cinema2K 48 +opj_compress lossy-check { -n 3 -i prec -m 933:827:2206 -p 201:184:314 } -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_48.j2k -cinema2K 48 +opj_compress lossy-check { -n 3 -i prec -m 194:173:531 -p 94:79:154 } -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_48.j2k -cinema2K 48 +opj_compress lossy-check { -n 3 -i prec -m 6:4:7 -p 141:141:193 } -i @INPUT_NR_PATH@/ElephantDream_4K.tif -o @TEMP_PATH@/ElephantDream_4K_C4K.j2k -cinema4K # issue 141 opj_compress -i @INPUT_NR_PATH@/issue141.rawl -o @TEMP_PATH@/issue141.rawl.j2k -F 2048,32,1,16,u -opj_compress -i @INPUT_NR_PATH@/issue141.rawl -o @TEMP_PATH@/issue141-I.rawl.j2k -F 2048,32,1,16,u -I +opj_compress lossy-check { -n 1 -m 61 -p 11 } -i @INPUT_NR_PATH@/issue141.tif -o @TEMP_PATH@/issue141-I.rawl.j2k -I # issue 46: opj_compress -i @INPUT_NR_PATH@/Bretagne2.ppm -o @TEMP_PATH@/Bretagne2_5.j2k -c [64,64] # issue 316 diff --git a/tools/travis-ci/knownfailures-Ubuntu12.04-gcc4.6.3-x86_64-Release-3rdP.txt b/tools/travis-ci/knownfailures-Ubuntu12.04-gcc4.6.3-x86_64-Release-3rdP.txt deleted file mode 100644 index 2554d91c..00000000 --- a/tools/travis-ci/knownfailures-Ubuntu12.04-gcc4.6.3-x86_64-Release-3rdP.txt +++ /dev/null @@ -1,8 +0,0 @@ -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-15-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-16-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-17-compare_dec-ref-out2base -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-18-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-19-compare_dec-ref-out2base 
-NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-20-compare_dec-ref-out2base -NR-ENC-ElephantDream_4K.tif-21-compare_dec-ref-out2base -NR-ENC-issue141.rawl-23-compare_dec-ref-out2base diff --git a/tools/travis-ci/knownfailures-Ubuntu14.04-clang3.8.0-x86_64-Release-3rdP.txt b/tools/travis-ci/knownfailures-Ubuntu14.04-clang3.8.0-x86_64-Release-3rdP.txt deleted file mode 100644 index 2554d91c..00000000 --- a/tools/travis-ci/knownfailures-Ubuntu14.04-clang3.8.0-x86_64-Release-3rdP.txt +++ /dev/null @@ -1,8 +0,0 @@ -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-15-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-16-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-17-compare_dec-ref-out2base -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-18-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-19-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-20-compare_dec-ref-out2base -NR-ENC-ElephantDream_4K.tif-21-compare_dec-ref-out2base -NR-ENC-issue141.rawl-23-compare_dec-ref-out2base diff --git a/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-i386-Release-3rdP.txt b/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-i386-Release-3rdP.txt index 9615008a..8e53a623 100644 --- a/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-i386-Release-3rdP.txt +++ b/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-i386-Release-3rdP.txt @@ -46,11 +46,3 @@ NR-DEC-kodak_2layers_lrcp.j2c-32-decode-md5 NR-DEC-issue135.j2k-68-decode-md5 NR-DEC-db11217111510058.jp2-306-decode-md5 NR-DEC-tnsot_zero.jp2-307-decode-md5 -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-15-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-16-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-17-compare_dec-ref-out2base -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-18-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-19-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-20-compare_dec-ref-out2base 
-NR-ENC-ElephantDream_4K.tif-21-compare_dec-ref-out2base -NR-ENC-issue141.rawl-23-compare_dec-ref-out2base diff --git a/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-x86_64-Debug-3rdP.txt b/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-x86_64-Debug-3rdP.txt deleted file mode 100644 index 2554d91c..00000000 --- a/tools/travis-ci/knownfailures-Ubuntu14.04-gcc4.8.4-x86_64-Debug-3rdP.txt +++ /dev/null @@ -1,8 +0,0 @@ -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-15-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-16-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-17-compare_dec-ref-out2base -NR-ENC-X_4_2K_24_185_CBR_WB_000.tif-18-compare_dec-ref-out2base -NR-ENC-X_5_2K_24_235_CBR_STEM24_000.tif-19-compare_dec-ref-out2base -NR-ENC-X_6_2K_24_FULL_CBR_CIRCLE_000.tif-20-compare_dec-ref-out2base -NR-ENC-ElephantDream_4K.tif-21-compare_dec-ref-out2base -NR-ENC-issue141.rawl-23-compare_dec-ref-out2base From c6a413a42394836b956846cc037dd8297b732f44 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 19 May 2020 19:45:00 +0200 Subject: [PATCH 08/24] opj_mct_encode_real(): add SSE optimization --- src/lib/openjp2/mct.c | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c index 9d79b50a..88c8f409 100644 --- a/src/lib/openjp2/mct.c +++ b/src/lib/openjp2/mct.c @@ -216,6 +216,53 @@ void opj_mct_encode_real( OPJ_SIZE_T n) { OPJ_SIZE_T i; +#ifdef __SSE__ + const __m128 YR = _mm_set1_ps(0.299f); + const __m128 YG = _mm_set1_ps(0.587f); + const __m128 YB = _mm_set1_ps(0.114f); + const __m128 UR = _mm_set1_ps(-0.16875f); + const __m128 UG = _mm_set1_ps(-0.331260f); + const __m128 UB = _mm_set1_ps(0.5f); + const __m128 VR = _mm_set1_ps(0.5f); + const __m128 VG = _mm_set1_ps(-0.41869f); + const __m128 VB = _mm_set1_ps(-0.08131f); + for (i = 0; i < (n >> 3); i ++) { + __m128 r, g, b, y, u, v; + + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + 
y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; + + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; +#endif for (i = 0; i < n; ++i) { OPJ_FLOAT32 r = c0[i]; OPJ_FLOAT32 g = c1[i]; From 4ab2ed090747ad2f22e78d45a1db5640d40e6a34 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 19 May 2020 22:26:22 +0200 Subject: [PATCH 09/24] opj_j2k_setup_encoder(): add validation of tile width and height to avoid potential division by zero --- src/lib/openjp2/j2k.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib/openjp2/j2k.c b/src/lib/openjp2/j2k.c index 842c8caa..1a6cdc3e 100644 --- a/src/lib/openjp2/j2k.c +++ b/src/lib/openjp2/j2k.c @@ -7823,6 +7823,14 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k, */ if (parameters->tile_size_on) { + if (cp->tdx == 0) { + opj_event_msg(p_manager, EVT_ERROR, "Invalid tile width\n"); + return OPJ_FALSE; + } + if (cp->tdy == 0) { + opj_event_msg(p_manager, EVT_ERROR, "Invalid tile height\n"); + return OPJ_FALSE; + } cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->x1 - cp->tx0), (OPJ_INT32)cp->tdx); cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->y1 - cp->ty0), From e46e300de55aa35279935ef9e8572949d725d833 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 May 2020 11:36:05 +0200 Subject: 
[PATCH 10/24] opj_dwt_encode_1_real(): avoid many bound comparisons, similarly to decoding side --- src/lib/openjp2/dwt.c | 108 ++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index bdc91cf5..a825f013 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -1040,59 +1040,77 @@ static void opj_idwt53_v(const opj_dwt_t *dwt, #endif } +static void opj_dwt_encode_step1(OPJ_FLOAT32* fw, + OPJ_UINT32 start, + OPJ_UINT32 end, + const OPJ_FLOAT32 c) +{ + OPJ_UINT32 i; + for (i = start; i < end; ++i) { + fw[i * 2] *= c; + } +} +static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, + OPJ_UINT32 start, + OPJ_UINT32 end, + OPJ_UINT32 m, + OPJ_FLOAT32 c) +{ + OPJ_UINT32 i; + OPJ_UINT32 imax = opj_uint_min(end, m); + if (start > 0) { + fw += 2 * start; + fl = fw - 2; + } + for (i = start; i < imax; ++i) { + fw[-1] += (fl[0] + fw[0]) * c; + fl = fw; + fw += 2; + } + if (m < end) { + assert(m + 1 == end); + fw[-1] += (2 * fl[0]) * c; + } +} -/* */ -/* Forward 9-7 wavelet transform in 1-D. 
*/ -/* */ static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { - OPJ_INT32 i; - OPJ_FLOAT32* a = (OPJ_FLOAT32*)aIn; - - if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_dwt_alpha * (OPJ_S_(i) + OPJ_S_(i + 1)); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_dwt_beta * (OPJ_D_(i - 1) + OPJ_D_(i)); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) += opj_dwt_gamma * (OPJ_S_(i) + OPJ_S_(i + 1)); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) += opj_dwt_delta * (OPJ_D_(i - 1) + OPJ_D_(i)); - } - for (i = 0; i < dn; i++) { - OPJ_D(i) = opj_K / 2 * OPJ_D(i); - } - for (i = 0; i < sn; i++) { - OPJ_S(i) = opj_c13318 / 2 * OPJ_S(i); - } + OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn; + OPJ_INT32 a, b; + if (cas == 0) { + if (!((dn > 0) || (sn > 1))) { + return; } + a = 0; + b = 1; } else { - if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_dwt_alpha * (OPJ_DD_(i) + OPJ_DD_(i - 1)); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_dwt_beta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) += opj_dwt_gamma * (OPJ_DD_(i) + OPJ_DD_(i - 1)); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) += opj_dwt_delta * (OPJ_SS_(i) + OPJ_SS_(i + 1)); - } - for (i = 0; i < dn; i++) { - OPJ_S(i) = opj_K / 2 * OPJ_S(i); - } - for (i = 0; i < sn; i++) { - OPJ_D(i) = opj_c13318 / 2 * OPJ_D(i); - } + if (!((sn > 0) || (dn > 1))) { + return; } + a = 1; + b = 0; } + opj_dwt_encode_step2(w + a, w + b + 1, + 0, (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_alpha); + opj_dwt_encode_step2(w + b, w + a + 1, + 0, (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_beta); + opj_dwt_encode_step2(w + a, w + b + 1, + 0, (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_gamma); + opj_dwt_encode_step2(w + b, w + a + 1, + 0, (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_delta); + 
opj_dwt_encode_step1(w + b, 0, (OPJ_UINT32)dn, + opj_K / 2); + opj_dwt_encode_step1(w + a, 0, (OPJ_UINT32)sn, + opj_c13318 / 2); } static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, From f38c069547f1c41dc94ec4a273efb07997685c21 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 May 2020 13:14:16 +0200 Subject: [PATCH 11/24] Irreversible decoding: align code more closely to the standard by avoid messing up with stepsize (no functional change) --- src/lib/openjp2/t1.c | 8 +++++--- src/lib/openjp2/tcd.c | 10 +++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 8d5feadf..0787dce8 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1725,10 +1725,11 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) datap[i] /= 2; } } else { /* if (tccp->qmfbid == 0) */ + const float stepsize = 0.5f * band->stepsize; i = 0; #ifdef __SSE2__ { - const __m128 xmm_stepsize = _mm_set1_ps(band->stepsize); + const __m128 xmm_stepsize = _mm_set1_ps(stepsize); for (; i < (cblk_size & ~15U); i += 16) { __m128 xmm0_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)( datap + 0))); @@ -1747,7 +1748,7 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) } #endif for (; i < cblk_size; ++i) { - OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * band->stepsize; + OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * stepsize; memcpy(datap, &tmp, sizeof(tmp)); datap++; } @@ -1773,12 +1774,13 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) } } } else { /* if (tccp->qmfbid == 0) */ + const float stepsize = 0.5f * band->stepsize; OPJ_FLOAT32* OPJ_RESTRICT tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; for (j = 0; j < cblk_h; ++j) { OPJ_FLOAT32* OPJ_RESTRICT tiledp2 = tiledp; for (i = 0; i < cblk_w; ++i) { - OPJ_FLOAT32 tmp = (OPJ_FLOAT32) * datap * band->stepsize; + OPJ_FLOAT32 tmp = (OPJ_FLOAT32) * 
datap * stepsize; *tiledp2 = tmp; datap++; tiledp2++; diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index e41e7772..a84ec063 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -112,7 +112,7 @@ void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) * Initializes tile coding/decoding */ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, + OPJ_BOOL isEncoder, OPJ_SIZE_T sizeof_block, opj_event_mgr_t* manager); /** @@ -721,7 +721,7 @@ OPJ_BOOL opj_alloc_tile_component_data(opj_tcd_tilecomp_t *l_tilec) /* ----------------------------------------------------------------------- */ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, - OPJ_BOOL isEncoder, OPJ_FLOAT32 fraction, OPJ_SIZE_T sizeof_block, + OPJ_BOOL isEncoder, OPJ_SIZE_T sizeof_block, opj_event_mgr_t* manager) { OPJ_UINT32(*l_gain_ptr)(OPJ_UINT32) = 00; @@ -1013,7 +1013,7 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, /* Delta_b value of Equation E-3 in "E.1 Inverse quantization * procedure" of the standard */ l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, - (OPJ_INT32)(numbps - l_step_size->expn)))) * fraction; + (OPJ_INT32)(numbps - l_step_size->expn)))); /* Mb value of Equation E-2 in "E.1 Inverse quantization * procedure" of the standard */ l_band->numbps = l_step_size->expn + (OPJ_INT32)l_tccp->numgbits - @@ -1196,14 +1196,14 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, OPJ_BOOL opj_tcd_init_encode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, opj_event_mgr_t* p_manager) { - return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_TRUE, 1.0F, + return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_TRUE, sizeof(opj_tcd_cblk_enc_t), p_manager); } OPJ_BOOL opj_tcd_init_decode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, opj_event_mgr_t* p_manager) { - return 
opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_FALSE, 0.5F, + return opj_tcd_init_tile(p_tcd, p_tile_no, OPJ_FALSE, sizeof(opj_tcd_cblk_dec_t), p_manager); } From 3cd1305596f191a01afdc11f9355f9c6590065dd Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 May 2020 18:00:45 +0200 Subject: [PATCH 12/24] Irreversible compression/decompression DWT: use 1/K constant as per standard The previous constant opj_c13318 was mysteriously equal to 2/K , and in the DWT, we had to divide K and opj_c13318 by 2... The issue was that the band->stepsize computation in tcd.c didn't take into account the log2gain of the band. The effect of this change is expected to be mostly equivalent to the previous situation, except some difference in rounding. But it leads to a dramatic reduction of the mean square error and peak error in the irreversible encoding of issue141.tif ! --- src/lib/openjp2/dwt.c | 44 ++++++------------------- src/lib/openjp2/dwt.h | 12 ------- src/lib/openjp2/t1.c | 4 +++ src/lib/openjp2/tcd.c | 28 ++++++++-------- tests/nonregression/test_suite.ctest.in | 12 +++---- 5 files changed, 33 insertions(+), 67 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index a825f013..de8fdf4e 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -103,13 +103,13 @@ typedef struct v4dwt_local { } opj_v4dwt_t ; /* From table F.4 from the standard */ -static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; /* 12994 */ -static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f; /* 434 */ -static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f; /* -7233 */ -static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f; /* -3633 */ +static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; +static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f; +static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f; +static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f; -static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */ -static const OPJ_FLOAT32 opj_c13318 = 1.625732422f; +static 
const OPJ_FLOAT32 opj_K = 1.230174105f; +static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105); /*@}*/ @@ -1108,9 +1108,9 @@ static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, (OPJ_UINT32)opj_int_min(sn, dn - a), opj_dwt_delta); opj_dwt_encode_step1(w + b, 0, (OPJ_UINT32)dn, - opj_K / 2); + opj_K); opj_dwt_encode_step1(w + a, 0, (OPJ_UINT32)sn, - opj_c13318 / 2); + opj_invK); } static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, @@ -1399,21 +1399,6 @@ OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, } } - -/* */ -/* Get gain of 5-3 wavelet transform. */ -/* */ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) -{ - if (orient == 0) { - return 0; - } - if (orient == 1 || orient == 2) { - return 1; - } - return 2; -} - /* */ /* Get norm of 5-3 wavelet. */ /* */ @@ -1440,15 +1425,6 @@ OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, opj_dwt_encode_1_real); } -/* */ -/* Get gain of 9-7 wavelet transform. */ -/* */ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) -{ - (void)orient; - return 0; -} - /* */ /* Get norm of 9-7 wavelet. 
*/ /* */ @@ -2649,7 +2625,7 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, _mm_set1_ps(opj_K)); opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - _mm_set1_ps(opj_c13318)); + _mm_set1_ps(opj_invK)); opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), @@ -2670,7 +2646,7 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - opj_c13318); + opj_invK); opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), diff --git a/src/lib/openjp2/dwt.h b/src/lib/openjp2/dwt.h index 89c859cb..215061e6 100644 --- a/src/lib/openjp2/dwt.h +++ b/src/lib/openjp2/dwt.h @@ -73,12 +73,6 @@ OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); -/** -Get the gain of a subband for the reversible 5-3 DWT. -@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns 0 if orient = 0, returns 1 if orient = 1 or 2, returns 2 otherwise -*/ -OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) ; /** Get the norm of a wavelet function of a subband at a specified level for the reversible 5-3 DWT. @param level Level of the wavelet function @@ -105,12 +99,6 @@ OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres); -/** -Get the gain of a subband for the irreversible 9-7 DWT. 
-@param orient Number that identifies the subband (0->LL, 1->HL, 2->LH, 3->HH) -@return Returns the gain of the 9-7 wavelet transform -*/ -OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient); /** Get the norm of a wavelet function of a subband at a specified level for the irreversible 9-7 DWT @param level Level of the wavelet function diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 0787dce8..937f420a 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1426,7 +1426,11 @@ static OPJ_FLOAT64 opj_t1_getwmsedec( if (qmfbid == 1) { w2 = opj_dwt_getnorm(level, orient); } else { /* if (qmfbid == 0) */ + const OPJ_INT32 log2_gain = (orient == 0) ? 0 : + (orient == 3) ? 2 : 1; w2 = opj_dwt_getnorm_real(level, orient); + /* Not sure this is right. But preserves past behaviour */ + stepsize /= (1 << log2_gain); } wmsedec = w1 * w2 * stepsize * (1 << bpno); diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index a84ec063..02fb11db 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -724,7 +724,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, OPJ_BOOL isEncoder, OPJ_SIZE_T sizeof_block, opj_event_mgr_t* manager) { - OPJ_UINT32(*l_gain_ptr)(OPJ_UINT32) = 00; OPJ_UINT32 compno, resno, bandno, precno, cblkno; opj_tcp_t * l_tcp = 00; opj_cp_t * l_cp = 00; @@ -740,7 +739,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, OPJ_UINT32 p, q; OPJ_UINT32 l_level_no; OPJ_UINT32 l_pdx, l_pdy; - OPJ_UINT32 l_gain; OPJ_INT32 l_x0b, l_y0b; OPJ_UINT32 l_tx0, l_ty0; /* extent of precincts , top left, bottom right**/ @@ -879,11 +877,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, l_level_no = l_tilec->numresolutions; l_res = l_tilec->resolutions; l_step_size = l_tccp->stepsizes; - if (l_tccp->qmfbid == 0) { - l_gain_ptr = &opj_dwt_getgain_real; - } else { - l_gain_ptr = &opj_dwt_getgain; - } /*fprintf(stderr, 
"\tlevel_no=%d\n",l_level_no);*/ for (resno = 0; resno < l_tilec->numresolutions; ++resno) { @@ -970,7 +963,6 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, l_band = l_res->bands; for (bandno = 0; bandno < l_res->numbands; ++bandno, ++l_band, ++l_step_size) { - OPJ_INT32 numbps; /*fprintf(stderr, "\t\t\tband_no=%d/%d\n", bandno, l_res->numbands );*/ if (resno == 0) { @@ -1006,14 +998,20 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, } } - /** avoid an if with storing function pointer */ - l_gain = (*l_gain_ptr)(l_band->bandno); - numbps = (OPJ_INT32)(l_image_comp->prec + l_gain); + { + /* Table E-1 - Sub-band gains */ + const OPJ_INT32 log2_gain = (l_band->bandno == 0) ? 0 : + (l_band->bandno == 3) ? 2 : 1; + + /* Nominal dynamic range. Equation E-4 */ + const OPJ_INT32 Rb = (OPJ_INT32)l_image_comp->prec + log2_gain; + + /* Delta_b value of Equation E-3 in "E.1 Inverse quantization + * procedure" of the standard */ + l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, + (OPJ_INT32)(Rb - l_step_size->expn)))); + } - /* Delta_b value of Equation E-3 in "E.1 Inverse quantization - * procedure" of the standard */ - l_band->stepsize = (OPJ_FLOAT32)(((1.0 + l_step_size->mant / 2048.0) * pow(2.0, - (OPJ_INT32)(numbps - l_step_size->expn)))); /* Mb value of Equation E-2 in "E.1 Inverse quantization * procedure" of the standard */ l_band->numbps = l_step_size->expn + (OPJ_INT32)l_tccp->numgbits - diff --git a/tests/nonregression/test_suite.ctest.in b/tests/nonregression/test_suite.ctest.in index 3cf4dfc2..32f87d37 100644 --- a/tests/nonregression/test_suite.ctest.in +++ b/tests/nonregression/test_suite.ctest.in @@ -32,16 +32,16 @@ opj_compress -i @INPUT_NR_PATH@/random-issue-0005.tif -o @TEMP_PATH@/random-issu # related to issue 62 opj_compress -i @INPUT_NR_PATH@/tmp-issue-0062.raw -o @TEMP_PATH@/tmp-issue-0062-u.raw.j2k -F 512,512,1,16,u opj_compress -i 
@INPUT_NR_PATH@/tmp-issue-0062.raw -o @TEMP_PATH@/tmp-issue-0062-s.raw.j2k -F 512,512,1,16,s -opj_compress lossy-check { -n 3 -i prec -m 175:100:212 -p 78:63:91 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_24.j2k -cinema2K 24 -opj_compress lossy-check { -n 3 -i prec -m 298:168:363 -p 121:73:164 } -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_24.j2k -cinema2K 24 -opj_compress lossy-check { -n 3 -i prec -m 76:54:140 -p 55:49:74 } -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_24.j2k -cinema2K 24 -opj_compress lossy-check { -n 3 -i prec -m 384:385:842 -p 134:146:200 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_48.j2k -cinema2K 48 +opj_compress lossy-check { -n 3 -i prec -m 175:100:212 -p 79:64:92 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 298:168:363 -p 122:73:164 } -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 76:54:140 -p 56:49:74 } -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_24.j2k -cinema2K 24 +opj_compress lossy-check { -n 3 -i prec -m 384:385:842 -p 135:144:202 } -i @INPUT_NR_PATH@/X_4_2K_24_185_CBR_WB_000.tif -o @TEMP_PATH@/X_4_2K_24_185_CBR_WB_000_C2K_48.j2k -cinema2K 48 opj_compress lossy-check { -n 3 -i prec -m 933:827:2206 -p 201:184:314 } -i @INPUT_NR_PATH@/X_5_2K_24_235_CBR_STEM24_000.tif -o @TEMP_PATH@/X_5_2K_24_235_CBR_STEM24_000_C2K_48.j2k -cinema2K 48 opj_compress lossy-check { -n 3 -i prec -m 194:173:531 -p 94:79:154 } -i @INPUT_NR_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000.tif -o @TEMP_PATH@/X_6_2K_24_FULL_CBR_CIRCLE_000_C2K_48.j2k -cinema2K 48 -opj_compress 
lossy-check { -n 3 -i prec -m 6:4:7 -p 141:141:193 } -i @INPUT_NR_PATH@/ElephantDream_4K.tif -o @TEMP_PATH@/ElephantDream_4K_C4K.j2k -cinema4K +opj_compress lossy-check { -n 3 -i prec -m 6:4:7 -p 141:141:191 } -i @INPUT_NR_PATH@/ElephantDream_4K.tif -o @TEMP_PATH@/ElephantDream_4K_C4K.j2k -cinema4K # issue 141 opj_compress -i @INPUT_NR_PATH@/issue141.rawl -o @TEMP_PATH@/issue141.rawl.j2k -F 2048,32,1,16,u -opj_compress lossy-check { -n 1 -m 61 -p 11 } -i @INPUT_NR_PATH@/issue141.tif -o @TEMP_PATH@/issue141-I.rawl.j2k -I +opj_compress lossy-check { -n 1 -m 0.1 -p 2 } -i @INPUT_NR_PATH@/issue141.tif -o @TEMP_PATH@/issue141-I.rawl.j2k -I # issue 46: opj_compress -i @INPUT_NR_PATH@/Bretagne2.ppm -o @TEMP_PATH@/Bretagne2_5.j2k -c [64,64] # issue 316 From adccbc8336ce5c46ca7de85ac364d9c992d337c7 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 May 2020 19:24:09 +0200 Subject: [PATCH 13/24] Irreversible decoding: partially revert previous commit, to fix failures in test suite --- src/lib/openjp2/dwt.c | 12 ++++++++++-- src/lib/openjp2/tcd.c | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index de8fdf4e..5710e802 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -2608,6 +2608,14 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) { OPJ_INT32 a, b; + /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ + /* Historic value for 2 / opj_invK */ + /* Normally, we should use invK, but if we do so, we have failures in the */ + /* conformance test, due to MSE and peak errors significantly higher than */ + /* accepted value */ + /* Due to using two_invK instead of invK, we have to compensate in tcd.c */ + /* the computation of the stepsize for the non LL subbands */ + const float two_invK = 1.625732422f; if (dwt->cas == 0) { if (!((dwt->dn > 0) || (dwt->sn > 1))) { return; @@ -2625,7 +2633,7 @@ static void 
opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, _mm_set1_ps(opj_K)); opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - _mm_set1_ps(opj_invK)); + _mm_set1_ps(two_invK)); opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), @@ -2646,7 +2654,7 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, - opj_invK); + two_invK); opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index 02fb11db..810f83ec 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -1000,7 +1000,11 @@ static INLINE OPJ_BOOL opj_tcd_init_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no, { /* Table E-1 - Sub-band gains */ - const OPJ_INT32 log2_gain = (l_band->bandno == 0) ? 0 : + /* BUG_WEIRD_TWO_INVK (look for this identifier in dwt.c): */ + /* the test (!isEncoder && l_tccp->qmfbid == 0) is strongly */ + /* linked to the use of two_invK instead of invK */ + const OPJ_INT32 log2_gain = (!isEncoder && + l_tccp->qmfbid == 0) ? 0 : (l_band->bandno == 0) ? 0 : (l_band->bandno == 3) ? 2 : 1; /* Nominal dynamic range. 
Equation E-4 */ From 0c09062464fe74f62fb9a4da829992c6a8bd7920 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 May 2020 23:20:48 +0200 Subject: [PATCH 14/24] bench_dwt.c: add a -I switch to test irreversible FWDT/IDWT --- src/lib/openjp2/bench_dwt.c | 108 ++++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 34 deletions(-) diff --git a/src/lib/openjp2/bench_dwt.c b/src/lib/openjp2/bench_dwt.c index 2b274145..1124cd61 100644 --- a/src/lib/openjp2/bench_dwt.c +++ b/src/lib/openjp2/bench_dwt.c @@ -49,7 +49,8 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec, OPJ_INT32 y0, OPJ_INT32 x1, OPJ_INT32 y1, - OPJ_UINT32 numresolutions) + OPJ_UINT32 numresolutions, + OPJ_BOOL irreversible) { opj_tcd_resolution_t* l_res; OPJ_UINT32 resno, l_level_no; @@ -64,7 +65,13 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec, (size_t)(l_tilec->y1 - l_tilec->y0); l_tilec->data = (OPJ_INT32*) opj_malloc(sizeof(OPJ_INT32) * nValues); for (i = 0; i < nValues; i++) { - l_tilec->data[i] = getValue((OPJ_UINT32)i); + OPJ_INT32 val = getValue((OPJ_UINT32)i); + if (irreversible) { + OPJ_FLOAT32 fVal = (OPJ_FLOAT32)val; + memcpy(&l_tilec->data[i], &fVal, sizeof(OPJ_FLOAT32)); + } else { + l_tilec->data[i] = val; + } } l_tilec->numresolutions = numresolutions; l_tilec->minimum_num_resolutions = numresolutions; @@ -99,7 +106,7 @@ void free_tilec(opj_tcd_tilecomp_t * l_tilec) void usage(void) { printf( - "bench_dwt [-decode|encode] [-size value] [-check] [-display]\n"); + "bench_dwt [-decode|encode] [-I] [-size value] [-check] [-display]\n"); printf( " [-num_resolutions val] [-offset x y] [-num_threads val]\n"); exit(1); @@ -163,6 +170,7 @@ int main(int argc, char** argv) OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1; OPJ_UINT32 num_resolutions = 6; OPJ_BOOL bench_decode = OPJ_TRUE; + OPJ_BOOL irreversible = OPJ_FALSE; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-encode") == 0) { @@ -171,9 +179,10 @@ int main(int argc, char** argv) bench_decode = OPJ_TRUE; } else if 
(strcmp(argv[i], "-display") == 0) { display = OPJ_TRUE; - check = OPJ_TRUE; } else if (strcmp(argv[i], "-check") == 0) { check = OPJ_TRUE; + } else if (strcmp(argv[i], "-I") == 0) { + irreversible = OPJ_TRUE; } else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) { size = atoi(argv[i + 1]); i ++; @@ -197,18 +206,29 @@ int main(int argc, char** argv) } } + if (irreversible && check) { + /* Due to irreversible inverse DWT not being symetric of forward */ + /* See BUG_WEIRD_TWO_INVK in dwt.c */ + printf("-I and -check aren't compatible\n"); + exit(1); + } + tp = opj_thread_pool_create(num_threads); init_tilec(&tilec, (OPJ_INT32)offset_x, (OPJ_INT32)offset_y, (OPJ_INT32)offset_x + size, (OPJ_INT32)offset_y + size, - num_resolutions); + num_resolutions, irreversible); if (display) { printf("Before\n"); k = 0; for (j = 0; j < tilec.y1 - tilec.y0; j++) { for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); + if (irreversible) { + printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]); + } else { + printf("%d ", tilec.data[k]); + } k ++; } printf("\n"); @@ -243,9 +263,17 @@ int main(int argc, char** argv) start = opj_clock(); start_wc = opj_wallclock(); if (bench_decode) { - opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); + if (irreversible) { + opj_dwt_decode_real(&tcd, &tilec, tilec.numresolutions); + } else { + opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); + } } else { - opj_dwt_encode(&tcd, &tilec); + if (irreversible) { + opj_dwt_encode_real(&tcd, &tilec); + } else { + opj_dwt_encode(&tcd, &tilec); + } } stop = opj_clock(); stop_wc = opj_wallclock(); @@ -254,22 +282,27 @@ int main(int argc, char** argv) stop - start, stop_wc - start_wc); - if (display || check) { - if (display) { - if (bench_decode) { - printf("After IDWT\n"); - } else { - printf("After FDWT\n"); - } - k = 0; - for (j = 0; j < tilec.y1 - tilec.y0; j++) { - for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); - k ++; - } - printf("\n"); - } + if 
(display) { + if (bench_decode) { + printf("After IDWT\n"); + } else { + printf("After FDWT\n"); } + k = 0; + for (j = 0; j < tilec.y1 - tilec.y0; j++) { + for (i = 0; i < tilec.x1 - tilec.x0; i++) { + if (irreversible) { + printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]); + } else { + printf("%d ", tilec.data[k]); + } + k ++; + } + printf("\n"); + } + } + + if ((display || check) && !irreversible) { if (bench_decode) { opj_dwt_encode(&tcd, &tilec); @@ -277,7 +310,8 @@ int main(int argc, char** argv) opj_dwt_decode(&tcd, &tilec, tilec.numresolutions); } - if (display) { + + if (display && !irreversible) { if (bench_decode) { printf("After FDWT\n"); } else { @@ -286,22 +320,28 @@ int main(int argc, char** argv) k = 0; for (j = 0; j < tilec.y1 - tilec.y0; j++) { for (i = 0; i < tilec.x1 - tilec.x0; i++) { - printf("%d ", tilec.data[k]); + if (irreversible) { + printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]); + } else { + printf("%d ", tilec.data[k]); + } k ++; } printf("\n"); } } - if (check) { - size_t idx; - size_t nValues = (size_t)(tilec.x1 - tilec.x0) * - (size_t)(tilec.y1 - tilec.y0); - for (idx = 0; idx < nValues; idx++) { - if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) { - printf("Difference found at idx = %u\n", (OPJ_UINT32)idx); - exit(1); - } + } + + if (check) { + + size_t idx; + size_t nValues = (size_t)(tilec.x1 - tilec.x0) * + (size_t)(tilec.y1 - tilec.y0); + for (idx = 0; idx < nValues; idx++) { + if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) { + printf("Difference found at idx = %u\n", (OPJ_UINT32)idx); + exit(1); } } } From 47943daa15983d20ab1ffd0a237125f1493c5658 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 May 2020 11:23:00 +0200 Subject: [PATCH 15/24] Speed-up 9x7 IDWT by ~20% "bench_dwt -I" time goes from 2.8s to 2.2s --- src/lib/openjp2/dwt.c | 382 +++++++++++++++++++----------------------- 1 file changed, 176 insertions(+), 206 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 5710e802..9fef2234 100644 
--- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -87,12 +87,14 @@ typedef struct dwt_local { OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ } opj_dwt_t; -typedef union { - OPJ_FLOAT32 f[4]; -} opj_v4_t; +#define NB_ELTS_V8 8 -typedef struct v4dwt_local { - opj_v4_t* wavelet ; +typedef union { + OPJ_FLOAT32 f[NB_ELTS_V8]; +} opj_v8_t; + +typedef struct v8dwt_local { + opj_v8_t* wavelet ; OPJ_INT32 dn ; /* number of elements in high pass band */ OPJ_INT32 sn ; /* number of elements in low pass band */ OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ @@ -100,7 +102,7 @@ typedef struct v4dwt_local { OPJ_UINT32 win_l_x1; /* end coord in low pass band */ OPJ_UINT32 win_h_x0; /* start coord in high pass band */ OPJ_UINT32 win_h_x1; /* end coord in high pass band */ -} opj_v4dwt_t ; +} opj_v8dwt_t ; /* From table F.4 from the standard */ static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; @@ -170,42 +172,6 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, /* */ /* Inverse 9-7 wavelet transform in 1-D. 
*/ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); - -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 remaining_height); - -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read); - -#ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const __m128 c); - -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, __m128 c); - -#else -static void opj_v4dwt_decode_step1(opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - const OPJ_FLOAT32 c); - -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, - OPJ_UINT32 start, - OPJ_UINT32 end, - OPJ_UINT32 m, - OPJ_FLOAT32 c); - -#endif /*@}*/ @@ -2332,7 +2298,7 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( return OPJ_TRUE; } -static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt, OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_UINT32 width, OPJ_UINT32 remaining_height) @@ -2343,39 +2309,69 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, OPJ_UINT32 x1 = dwt->win_l_x1; for (k = 0; k < 2; ++k) { - if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && - ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { + if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 && + ((OPJ_SIZE_T) bi & 0x0f) == 0) { /* Fast code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; - bi[i * 8 + 1] = a[j]; + dst[1] = a[j]; j += width; - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; - bi[i * 8 + 3] = a[j]; + dst[3] = a[j]; + j += width; + dst[4] = a[j]; + j += width; + dst[5] = a[j]; + j += width; + dst[6] = a[j]; + j += 
width; + dst[7] = a[j]; } } else { /* Slow code path */ for (i = x0; i < x1; ++i) { OPJ_UINT32 j = i; - bi[i * 8 ] = a[j]; + OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; + dst[0] = a[j]; j += width; if (remaining_height == 1) { continue; } - bi[i * 8 + 1] = a[j]; + dst[1] = a[j]; j += width; if (remaining_height == 2) { continue; } - bi[i * 8 + 2] = a[j]; + dst[2] = a[j]; j += width; if (remaining_height == 3) { continue; } - bi[i * 8 + 3] = a[j]; /* This one*/ + dst[3] = a[j]; + j += width; + if (remaining_height == 4) { + continue; + } + dst[4] = a[j]; + j += width; + if (remaining_height == 5) { + continue; + } + dst[5] = a[j]; + j += width; + if (remaining_height == 6) { + continue; + } + dst[6] = a[j]; + j += width; + if (remaining_height == 7) { + continue; + } + dst[7] = a[j]; } } @@ -2386,7 +2382,7 @@ static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, +static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_line, OPJ_UINT32 remaining_height) @@ -2399,25 +2395,25 @@ static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, dwt->win_l_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, /* Nasty cast from float* to int32* */ (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, - 8, 0, OPJ_TRUE); + 2 * NB_ELTS_V8, 0, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } } -static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, - OPJ_FLOAT32* OPJ_RESTRICT a, - OPJ_UINT32 width, - OPJ_UINT32 nb_elts_read) +static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt, + OPJ_FLOAT32* OPJ_RESTRICT a, + OPJ_UINT32 width, + 
OPJ_UINT32 nb_elts_read) { - opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; + opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; OPJ_UINT32 i; for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { @@ -2434,7 +2430,7 @@ static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, } } -static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, +static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt, opj_sparse_array_int32_t* sa, OPJ_UINT32 sa_col, OPJ_UINT32 nb_elts_read) @@ -2444,44 +2440,36 @@ static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, sa_col, dwt->win_l_x0, sa_col + nb_elts_read, dwt->win_l_x1, (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); ret = opj_sparse_array_int32_read(sa, sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), - 1, 8, OPJ_TRUE); + 1, 2 * NB_ELTS_V8, OPJ_TRUE); assert(ret); OPJ_UNUSED(ret); } #ifdef __SSE__ -static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, +static void opj_v8dwt_decode_step1_sse(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const __m128 c) { __m128* OPJ_RESTRICT vw = (__m128*) w; - OPJ_UINT32 i; - /* 4x unrolled loop */ - vw += 2 * start; - for (i = start; i + 3 < end; i += 4, vw += 8) { - __m128 xmm0 = _mm_mul_ps(vw[0], c); - __m128 xmm2 = _mm_mul_ps(vw[2], c); - __m128 xmm4 = _mm_mul_ps(vw[4], c); - __m128 xmm6 = _mm_mul_ps(vw[6], c); - vw[0] = xmm0; - vw[2] = xmm2; - vw[4] = xmm4; - vw[6] = xmm6; - } - for (; i < end; ++i, vw += 2) { + OPJ_UINT32 i = start; + /* To be adapted if NB_ELTS_V8 changes */ + vw += 4 * start; + /* Note: attempt at loop unrolling x2 doesn't help */ + for (; i < end; ++i, vw += 4) { vw[0] = _mm_mul_ps(vw[0], c); + vw[1] = _mm_mul_ps(vw[1], c); } } -static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, +static void 
opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, OPJ_UINT32 m, @@ -2489,74 +2477,58 @@ static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, { __m128* OPJ_RESTRICT vl = (__m128*) l; __m128* OPJ_RESTRICT vw = (__m128*) w; + /* To be adapted if NB_ELTS_V8 changes */ OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); - __m128 tmp1, tmp2, tmp3; if (start == 0) { - tmp1 = vl[0]; + if (imax >= 1) { + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c)); + vw += 4; + start = 1; + } } else { - vw += start * 2; - tmp1 = vw[-3]; + vw += start * 4; } i = start; - - /* 4x loop unrolling */ - for (; i + 3 < imax; i += 4) { - __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - tmp4 = vw[ 1]; - tmp5 = vw[ 2]; - tmp6 = vw[ 3]; - tmp7 = vw[ 4]; - tmp8 = vw[ 5]; - tmp9 = vw[ 6]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); - vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); - vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); - tmp1 = tmp9; - vw += 8; - } - + /* Note: attempt at loop unrolling x2 doesn't help */ for (; i < imax; ++i) { - tmp2 = vw[-1]; - tmp3 = vw[ 0]; - vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - tmp1 = tmp3; - vw += 2; + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c)); + vw += 4; } if (m < end) { assert(m + 1 == end); c = _mm_add_ps(c, c); - c = _mm_mul_ps(c, vw[-2]); - vw[-1] = _mm_add_ps(vw[-1], c); + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4])); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3])); } } #else -static void opj_v4dwt_decode_step1(opj_v4_t* w, +static void opj_v8dwt_decode_step1(opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, const OPJ_FLOAT32 
c) { OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; OPJ_UINT32 i; + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < end; ++i) { - OPJ_FLOAT32 tmp1 = fw[i * 8 ]; - OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; - OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; - OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; - fw[i * 8 ] = tmp1 * c; - fw[i * 8 + 1] = tmp2 * c; - fw[i * 8 + 2] = tmp3 * c; - fw[i * 8 + 3] = tmp4 * c; + fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c; + fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c; + fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c; + fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c; + fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c; + fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c; + fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c; + fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c; } } -static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, +static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w, OPJ_UINT32 start, OPJ_UINT32 end, OPJ_UINT32 m, @@ -2567,36 +2539,33 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); if (start > 0) { - fw += 8 * start; - fl = fw - 8; + fw += 2 * NB_ELTS_V8 * start; + fl = fw - 2 * NB_ELTS_V8; } + /* To be adapted if NB_ELTS_V8 changes */ for (i = start; i < imax; ++i) { - OPJ_FLOAT32 tmp1_1 = fl[0]; - OPJ_FLOAT32 tmp1_2 = fl[1]; - OPJ_FLOAT32 tmp1_3 = fl[2]; - OPJ_FLOAT32 tmp1_4 = fl[3]; - OPJ_FLOAT32 tmp2_1 = fw[-4]; - OPJ_FLOAT32 tmp2_2 = fw[-3]; - OPJ_FLOAT32 tmp2_3 = fw[-2]; - OPJ_FLOAT32 tmp2_4 = fw[-1]; - OPJ_FLOAT32 tmp3_1 = fw[0]; - OPJ_FLOAT32 tmp3_2 = fw[1]; - OPJ_FLOAT32 tmp3_3 = fw[2]; - OPJ_FLOAT32 tmp3_4 = fw[3]; - fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); - fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); - fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); - fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); + fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c); + fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c); + fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c); + fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c); + fw[-4] = fw[-4] + ((fl[4] + 
fw[4]) * c); + fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c); + fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c); + fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c); fl = fw; - fw += 8; + fw += 2 * NB_ELTS_V8; } if (m < end) { assert(m + 1 == end); c += c; - fw[-4] = fw[-4] + fl[0] * c; - fw[-3] = fw[-3] + fl[1] * c; - fw[-2] = fw[-2] + fl[2] * c; - fw[-1] = fw[-1] + fl[3] * c; + fw[-8] = fw[-8] + fl[0] * c; + fw[-7] = fw[-7] + fl[1] * c; + fw[-6] = fw[-6] + fl[2] * c; + fw[-5] = fw[-5] + fl[3] * c; + fw[-4] = fw[-4] + fl[4] * c; + fw[-3] = fw[-3] + fl[5] * c; + fw[-2] = fw[-2] + fl[6] * c; + fw[-1] = fw[-1] + fl[7] * c; } } @@ -2605,7 +2574,7 @@ static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, /* */ /* Inverse 9-7 wavelet transform in 1-D. */ /* */ -static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) +static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) { OPJ_INT32 a, b; /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ @@ -2630,44 +2599,44 @@ static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) b = 0; } #ifdef __SSE__ - opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, _mm_set1_ps(opj_K)); - opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, _mm_set1_ps(two_invK)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(-opj_dwt_delta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(-opj_dwt_gamma)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, + 
opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(-opj_dwt_beta)); - opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(-opj_dwt_alpha)); #else - opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K); - opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, two_invK); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), -opj_dwt_delta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), -opj_dwt_gamma); - opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, + opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->win_l_x0, dwt->win_l_x1, (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), -opj_dwt_beta); - opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, + opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), -opj_dwt_alpha); @@ -2682,8 +2651,8 @@ static OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; opj_tcd_resolution_t* res = tilec->resolutions; @@ -2706,11 +2675,11 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } l_data_size += 5U; /* overflow 
check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ return OPJ_FALSE; @@ -2738,35 +2707,36 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = (OPJ_UINT32)h.sn; h.win_h_x0 = 0; h.win_h_x1 = (OPJ_UINT32)h.dn; - for (j = 0; j + 3 < rh; j += 4) { + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_h(&h, aj, w, rh - j); + opj_v8dwt_decode(&h); + /* To be adapted if NB_ELTS_V8 changes */ for (k = 0; k < rw; k++) { aj[k ] = h.wavelet[k].f[0]; aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; } + for (k = 0; k < rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7]; + } - aj += w * 4; + aj += w * NB_ELTS_V8; } if (j < rh) { OPJ_UINT32 k; - opj_v4dwt_interleave_h(&h, aj, w, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_h(&h, aj, w, rh - j); + opj_v8dwt_decode(&h); for (k = 0; k < rw; k++) { - switch (rh - j) { - case 3: - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - /* FALLTHRU */ - case 2: - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - /* FALLTHRU */ - case 1: - aj[k] = h.wavelet[k].f[0]; + OPJ_UINT32 l; + for (l = 0; l < rh - j; l++) { + aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l]; } } } @@ -2779,25 +2749,25 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_h_x1 = (OPJ_UINT32)v.dn; aj = (OPJ_FLOAT32*) 
tilec->data; - for (j = rw; j > 3; j -= 4) { + for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { OPJ_UINT32 k; - opj_v4dwt_interleave_v(&v, aj, w, 4); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&v); for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); + memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); } - aj += 4; + aj += NB_ELTS_V8; } - if (rw & 0x03) { + if (rw & (NB_ELTS_V8 - 1)) { OPJ_UINT32 k; - j = rw & 0x03; + j = rw & (NB_ELTS_V8 - 1); - opj_v4dwt_interleave_v(&v, aj, w, j); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, j); + opj_v8dwt_decode(&v); for (k = 0; k < rh; ++k) { memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], @@ -2815,8 +2785,8 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { opj_sparse_array_int32_t* sa; - opj_v4dwt_t h; - opj_v4dwt_t v; + opj_v8dwt_t h; + opj_v8dwt_t v; OPJ_UINT32 resno; /* This value matches the maximum left/right extension given in tables */ /* F.2 and F.3 of the standard. 
Note: in opj_tcd_is_subband_area_of_interest() */ @@ -2873,12 +2843,12 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } l_data_size += 5U; /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { + if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); return OPJ_FALSE; } - h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); + h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); if (!h.wavelet) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); @@ -2973,17 +2943,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = win_ll_x1; h.win_h_x0 = win_hl_x0; h.win_h_x1 = win_hl_x1; - for (j = 0; j + 3 < rh; j += 4) { - if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { + if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { - opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j)); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, - win_tr_x1, j + 4, + win_tr_x1, j + NB_ELTS_V8, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -2993,16 +2963,16 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, } if (j < rh && - ((j + 3 >= win_ll_y0 && j < win_ll_y1) || - (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && + ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || + (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + 
(OPJ_UINT32)v.sn))) { - opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); - opj_v4dwt_decode(&h); + opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j); + opj_v8dwt_decode(&h); if (!opj_sparse_array_int32_write(sa, win_tr_x0, j, win_tr_x1, rh, (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], - 4, 1, OPJ_TRUE)) { + NB_ELTS_V8, 1, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); @@ -3014,17 +2984,17 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_l_x1 = win_ll_y1; v.win_h_x0 = win_lh_y0; v.win_h_x1 = win_lh_y1; - for (j = win_tr_x0; j < win_tr_x1; j += 4) { - OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); + for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) { + OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j); - opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); - opj_v4dwt_decode(&v); + opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts); + opj_v8dwt_decode(&v); if (!opj_sparse_array_int32_write(sa, j, win_tr_y0, j + nb_elts, win_tr_y1, (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], - 1, 4, OPJ_TRUE)) { + 1, NB_ELTS_V8, OPJ_TRUE)) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); opj_aligned_free(h.wavelet); From 272b3e0fb2530ca8bffdc8c64f1505a8ff5f6ecc Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 May 2020 11:24:29 +0200 Subject: [PATCH 16/24] Remove useless + 5U margin in opj_dwt_decode_tile_97() Nothing in code analysis nor test suite shows that this margin is needed. It dates back to commit dbeebe72b9d35f6ff807c21c7f217b569fa894f6 where vector 9x7 decoding was introduced. 
--- src/lib/openjp2/dwt.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 9fef2234..84d5aaf5 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -2667,13 +2667,11 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_SIZE_T l_data_size; - l_data_size = opj_dwt_max_resolution(res, numres); - /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { - /* FIXME event manager error callback */ - return OPJ_FALSE; + if (numres == 1) { + return OPJ_TRUE; } - l_data_size += 5U; + + l_data_size = opj_dwt_max_resolution(res, numres); /* overflow check */ if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ @@ -2836,13 +2834,6 @@ OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, l_data_size = opj_dwt_max_resolution(tr, numres); /* overflow check */ - if (l_data_size > (SIZE_MAX - 5U)) { - /* FIXME event manager error callback */ - opj_sparse_array_int32_free(sa); - return OPJ_FALSE; - } - l_data_size += 5U; - /* overflow check */ if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { /* FIXME event manager error callback */ opj_sparse_array_int32_free(sa); From 45a35223b79dee65a0059f999b690072e829669f Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 May 2020 16:54:48 +0200 Subject: [PATCH 17/24] Speed-up 9x7 IDWT by ~30% with OPJ_NUM_THREADS=2 "bench_dwt -I" time goes from 2.2s to 1.5s --- src/lib/openjp2/dwt.c | 232 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 205 insertions(+), 27 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 84d5aaf5..8790626e 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -2643,12 +2643,99 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) #endif } +typedef struct { + opj_v8dwt_t h; + OPJ_UINT32 rw; + OPJ_UINT32 w; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 nb_rows; +}
opj_dwt97_decode_h_job_t; + +static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt97_decode_h_job_t* job; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 w; + (void)tls; + + job = (opj_dwt97_decode_h_job_t*)user_data; + w = job->w; + + assert((job->nb_rows % NB_ELTS_V8) == 0); + + aj = job->aj; + for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) { + OPJ_UINT32 k; + opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8); + opj_v8dwt_decode(&job->h); + + /* To be adapted if NB_ELTS_V8 changes */ + for (k = 0; k < job->rw; k++) { + aj[k ] = job->h.wavelet[k].f[0]; + aj[k + (OPJ_SIZE_T)w ] = job->h.wavelet[k].f[1]; + aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2]; + aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3]; + } + for (k = 0; k < job->rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7]; + } + + aj += w * NB_ELTS_V8; + } + + opj_aligned_free(job->h.wavelet); + opj_free(job); +} + + +typedef struct { + opj_v8dwt_t v; + OPJ_UINT32 rh; + OPJ_UINT32 w; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + OPJ_UINT32 nb_columns; +} opj_dwt97_decode_v_job_t; + +static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls) +{ + OPJ_UINT32 j; + opj_dwt97_decode_v_job_t* job; + OPJ_FLOAT32 * OPJ_RESTRICT aj; + (void)tls; + + job = (opj_dwt97_decode_v_job_t*)user_data; + + assert((job->nb_columns % NB_ELTS_V8) == 0); + + aj = job->aj; + for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) { + OPJ_UINT32 k; + + opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8); + opj_v8dwt_decode(&job->v); + + for (k = 0; k < job->rh; ++k) { + memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k], + NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); + } + aj += NB_ELTS_V8; + } + + opj_aligned_free(job->v.wavelet); + opj_free(job); +} + /* */ /* Inverse 9-7 wavelet 
transform in 2-D. */ /* */ static -OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, +OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp, + opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, OPJ_UINT32 numres) { opj_v8dwt_t h; @@ -2666,6 +2753,7 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); OPJ_SIZE_T l_data_size; + const int num_threads = opj_thread_pool_get_thread_count(tp); if (numres == 1) { return OPJ_TRUE; @@ -2705,26 +2793,70 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, h.win_l_x1 = (OPJ_UINT32)h.sn; h.win_h_x0 = 0; h.win_h_x1 = (OPJ_UINT32)h.dn; - for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { - OPJ_UINT32 k; - opj_v8dwt_interleave_h(&h, aj, w, rh - j); - opj_v8dwt_decode(&h); - /* To be adapted if NB_ELTS_V8 changes */ - for (k = 0; k < rw; k++) { - aj[k ] = h.wavelet[k].f[0]; - aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; - aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; - aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; - } - for (k = 0; k < rw; k++) { - aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; - aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; - aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; - aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7]; - } + if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) { + for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { + OPJ_UINT32 k; + opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&h); - aj += w * NB_ELTS_V8; + /* To be adapted if NB_ELTS_V8 changes */ + for (k = 0; k < rw; k++) { + aj[k ] = h.wavelet[k].f[0]; + aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; + aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; + aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; + } + for (k = 0; k < rw; k++) { + aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; + aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; + aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; + aj[k + (OPJ_SIZE_T)w * 7] = 
h.wavelet[k].f[7]; + } + + aj += w * NB_ELTS_V8; + } + } else { + OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; + OPJ_UINT32 step_j; + + if ((rh / NB_ELTS_V8) < num_jobs) { + num_jobs = rh / NB_ELTS_V8; + } + step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; + for (j = 0; j < num_jobs; j++) { + opj_dwt97_decode_h_job_t* job; + + job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); + if (!job->h.wavelet) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->h.dn = h.dn; + job->h.sn = h.sn; + job->h.cas = h.cas; + job->h.win_l_x0 = h.win_l_x0; + job->h.win_l_x1 = h.win_l_x1; + job->h.win_h_x0 = h.win_h_x0; + job->h.win_h_x1 = h.win_h_x1; + job->rw = rw; + job->w = w; + job->aj = aj; + job->nb_rows = (j + 1 == num_jobs) ? 
(rh & (OPJ_UINT32)~ + (NB_ELTS_V8 - 1)) - j * step_j : step_j; + aj += w * job->nb_rows; + opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job); + } + opj_thread_pool_wait_completion(tp, 0); + j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1); } if (j < rh) { @@ -2747,16 +2879,62 @@ OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, v.win_h_x1 = (OPJ_UINT32)v.dn; aj = (OPJ_FLOAT32*) tilec->data; - for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { - OPJ_UINT32 k; + if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { + for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { + OPJ_UINT32 k; - opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); - opj_v8dwt_decode(&v); + opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); + opj_v8dwt_decode(&v); - for (k = 0; k < rh; ++k) { - memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); + for (k = 0; k < rh; ++k) { + memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); + } + aj += NB_ELTS_V8; } - aj += NB_ELTS_V8; + } else { + /* "bench_dwt -I" shows that scaling is poor, likely due to RAM + transfer being the limiting factor. So limit the number of + threads. 
+ */ + OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U); + OPJ_UINT32 step_j; + + if ((rw / NB_ELTS_V8) < num_jobs) { + num_jobs = rw / NB_ELTS_V8; + } + step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; + for (j = 0; j < num_jobs; j++) { + opj_dwt97_decode_v_job_t* job; + + job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t)); + if (!job) { + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); + if (!job->v.wavelet) { + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(h.wavelet); + return OPJ_FALSE; + } + job->v.dn = v.dn; + job->v.sn = v.sn; + job->v.cas = v.cas; + job->v.win_l_x0 = v.win_l_x0; + job->v.win_l_x1 = v.win_l_x1; + job->v.win_h_x0 = v.win_h_x0; + job->v.win_h_x1 = v.win_h_x1; + job->rh = rh; + job->w = w; + job->aj = aj; + job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~ + (NB_ELTS_V8 - 1)) - j * step_j : step_j; + aj += job->nb_columns; + opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job); + } + opj_thread_pool_wait_completion(tp, 0); } if (rw & (NB_ELTS_V8 - 1)) { @@ -3018,7 +3196,7 @@ OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, OPJ_UINT32 numres) { if (p_tcd->whole_tile_decoding) { - return opj_dwt_decode_tile_97(tilec, numres); + return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres); } else { return opj_dwt_decode_partial_97(tilec, numres); } From bd5f5ee7dea851aedae630fee094ef3b0ff0c888 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 21 May 2020 21:20:19 +0200 Subject: [PATCH 18/24] Forward DWT: small code refactoring to allow future improvements for the horizontal pass --- src/lib/openjp2/dwt.c | 106 ++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 8790626e..ff0375ef 100644 --- 
a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -126,12 +126,16 @@ typedef void (*DWT1DFN)(const opj_dwt_t* v); /** Forward lazy transform (horizontal) */ -static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, +static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, + OPJ_INT32 * OPJ_RESTRICT b, + OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** Forward lazy transform (vertical) */ -static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, +static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a, + OPJ_INT32 * OPJ_RESTRICT b, + OPJ_INT32 dn, OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas); /** Forward 5-3 wavelet transform in 1-D @@ -162,9 +166,17 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32, OPJ_INT32); +typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)( + void *row, + void *tmp, + OPJ_UINT32 width, + OPJ_BOOL even); + static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - opj_encode_one_row_fnptr_type p_function); + opj_encode_one_row_fnptr_type p_function, + opj_encode_and_deinterleave_h_one_row_fnptr_type + p_encode_and_deinterleave_h_one_row); static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, OPJ_UINT32 i); @@ -218,12 +230,14 @@ static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { /* */ /* Forward lazy transform (horizontal). 
*/ /* */ -static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, +static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, + OPJ_INT32 * OPJ_RESTRICT b, + OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) { OPJ_INT32 i; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; + OPJ_INT32 * OPJ_RESTRICT l_dest = b; + const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas; for (i = 0; i < sn; ++i) { *l_dest++ = *l_src; @@ -242,12 +256,14 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, /* */ /* Forward lazy transform (vertical). */ /* */ -static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, +static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a, + OPJ_INT32 * OPJ_RESTRICT b, + OPJ_INT32 dn, OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas) { OPJ_INT32 i = sn; - OPJ_INT32 * l_dest = b; - OPJ_INT32 * l_src = a + cas; + OPJ_INT32 * OPJ_RESTRICT l_dest = b; + const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas; while (i--) { *l_dest = *l_src; @@ -272,7 +288,7 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, /* */ static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) { - OPJ_INT32 *ai = a; + const OPJ_INT32 *ai = a; OPJ_INT32 *bi = h->mem + h->cas; OPJ_INT32 i = h->sn; while (i--) { @@ -293,7 +309,7 @@ static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) /* */ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) { - OPJ_INT32 *ai = a; + const OPJ_INT32 *ai = a; OPJ_INT32 *bi = v->mem + v->cas; OPJ_INT32 i = v->sn; while (i--) { @@ -1095,15 +1111,48 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, ========================================================== */ +/** Process one line for the horizontal pass of the 5x3 forward transform */ +static +void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn, + void* tmpIn, + OPJ_UINT32 width, + OPJ_BOOL even) +{ + OPJ_INT32* OPJ_RESTRICT row = 
(OPJ_INT32*)rowIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn; + const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); + memcpy(tmp, row, width * sizeof(OPJ_INT32)); + opj_dwt_encode_1(tmp, dn, sn, even ? 0 : 1); + opj_dwt_deinterleave_h(tmp, row, dn, sn, even ? 0 : 1); +} + +/** Process one line for the horizontal pass of the 9x7 forward transform */ +static +void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn, + void* tmpIn, + OPJ_UINT32 width, + OPJ_BOOL even) +{ + OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn; + OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn; + const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); + memcpy(tmp, row, width * sizeof(OPJ_FLOAT32)); + opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); + opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp, + (OPJ_INT32 * OPJ_RESTRICT)row, + dn, sn, even ? 0 : 1); +} typedef struct { opj_dwt_t h; - OPJ_UINT32 rw; - OPJ_UINT32 w; + OPJ_UINT32 rw; /* Width of the resolution to process */ + OPJ_UINT32 w; /* Width of tiledp */ OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - opj_encode_one_row_fnptr_type p_function; + opj_encode_and_deinterleave_h_one_row_fnptr_type p_function; } opj_dwt_encode_h_job_t; static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) @@ -1115,12 +1164,8 @@ static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) job = (opj_dwt_encode_h_job_t*)user_data; for (j = job->min_j; j < job->max_j; j++) { OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w; - OPJ_UINT32 k; - for (k = 0; k < job->rw; k++) { - job->h.mem[k] = aj[k]; - } - (*job->p_function)(job->h.mem, job->h.dn, job->h.sn, job->h.cas); - opj_dwt_deinterleave_h(job->h.mem, aj, job->h.dn, job->h.sn, job->h.cas); + (*job->p_function)(aj, job->h.mem, job->rw, + job->h.cas == 0 ? 
OPJ_TRUE : OPJ_FALSE); } opj_aligned_free(job->h.mem); @@ -1166,7 +1211,9 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) /* */ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - opj_encode_one_row_fnptr_type p_function) + opj_encode_one_row_fnptr_type p_function, + opj_encode_and_deinterleave_h_one_row_fnptr_type + p_encode_and_deinterleave_h_one_row) { OPJ_INT32 i; OPJ_INT32 *bj = 00; @@ -1286,12 +1333,8 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, if (num_threads <= 1 || rh <= 1) { for (j = 0; j < rh; j++) { OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w; - OPJ_UINT32 k; - for (k = 0; k < rw; k++) { - bj[k] = aj[k]; - } - (*p_function)(bj, dn, sn, cas_row); - opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); + (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw, + cas_row == 0 ? OPJ_TRUE : OPJ_FALSE); } } else { OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; @@ -1329,7 +1372,7 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ job->max_j = rh; } - job->p_function = p_function; + job->p_function = p_encode_and_deinterleave_h_one_row; opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job); } opj_thread_pool_wait_completion(tp, 0); @@ -1349,7 +1392,9 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t * tilec) { - return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, opj_dwt_encode_1); + return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, + opj_dwt_encode_1, + opj_dwt_encode_and_deinterleave_h_one_row); } /* */ @@ -1388,7 +1433,8 @@ OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t * tilec) { return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, - opj_dwt_encode_1_real); + opj_dwt_encode_1_real, + opj_dwt_encode_and_deinterleave_h_one_row_real); } /* */ From 
97b384aecdeea0c286213f5caf6244ec08ea32d7 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 22 May 2020 15:03:40 +0200 Subject: [PATCH 19/24] Forward DWT 5x3: performance improvements in horizontal pass, and modest in vertical pass --- src/lib/openjp2/dwt.c | 80 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index ff0375ef..27efd9b1 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -339,23 +339,37 @@ static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32* a = (OPJ_INT32*)aIn; if (!cas) { - if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ - for (i = 0; i < dn; i++) { - OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; + if (sn + dn > 1) { + for (i = 0; i < sn - 1; i++) { + OPJ_D(i) -= (OPJ_S(i) + OPJ_S(i + 1)) >> 1; } - for (i = 0; i < sn; i++) { - OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; + if (((sn + dn) % 2) == 0) { + OPJ_D(i) -= OPJ_S(i); + } + OPJ_S(0) += (OPJ_D(0) + OPJ_D(0) + 2) >> 2; + for (i = 1; i < dn; i++) { + OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; + } + if (((sn + dn) % 2) == 1) { + OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i - 1) + 2) >> 2; } } } else { - if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ - OPJ_S(0) *= 2; + if (sn + dn == 1) { + a[0] *= 2; } else { - for (i = 0; i < dn; i++) { - OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; + OPJ_S(0) -= OPJ_D(0); + for (i = 1; i < sn; i++) { + OPJ_S(i) -= (OPJ_D(i) + OPJ_D(i - 1)) >> 1; } - for (i = 0; i < sn; i++) { - OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; + if (((sn + dn) % 2) == 1) { + OPJ_S(i) -= OPJ_D(i - 1); + } + for (i = 0; i < dn - 1; i++) { + OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1) + 2) >> 2; + } + if (((sn + dn) % 2) == 0) { + OPJ_D(i) += (OPJ_S(i) + OPJ_S(i) + 2) >> 2; } } } @@ -1122,9 +1136,47 @@ void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn, OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn; const 
OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); - memcpy(tmp, row, width * sizeof(OPJ_INT32)); - opj_dwt_encode_1(tmp, dn, sn, even ? 0 : 1); - opj_dwt_deinterleave_h(tmp, row, dn, sn, even ? 0 : 1); + + if (even) { + if (width > 1) { + OPJ_INT32 i; + for (i = 0; i < sn - 1; i++) { + tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1); + } + if ((width % 2) == 0) { + tmp[sn + i] = row[2 * i + 1] - row[(i) * 2]; + } + row[0] += (tmp[sn] + tmp[sn] + 2) >> 2; + for (i = 1; i < dn; i++) { + row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2); + } + if ((width % 2) == 1) { + row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2); + } + memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); + } + } else { + if (width == 1) { + row[0] *= 2; + } else { + OPJ_INT32 i; + tmp[sn + 0] = row[0] - row[1]; + for (i = 1; i < sn; i++) { + tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1); + } + if ((width % 2) == 1) { + tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1]; + } + + for (i = 0; i < dn - 1; i++) { + row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2); + } + if ((width % 2) == 0) { + row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2); + } + memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); + } + } } /** Process one line for the horizontal pass of the 9x7 forward transform */ From 33d3d0de07be710f53940c7548b9f2bd58ff3210 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 22 May 2020 15:06:29 +0200 Subject: [PATCH 20/24] dwt.c: remove unused typedef --- src/lib/openjp2/dwt.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 27efd9b1..79be0f56 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -115,11 +115,6 @@ static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105); /*@}*/ -/** -Virtual 
function type for wavelet transform in 1-D -*/ -typedef void (*DWT1DFN)(const opj_dwt_t* v); - /** @name Local static functions */ /*@{*/ From e69fa09f604bc472f9fbff5c2b2db65e8dbe2418 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 22 May 2020 15:58:47 +0200 Subject: [PATCH 21/24] Forward DWT: small code refactoring to allow future improvements for the vertical pass --- src/lib/openjp2/dwt.c | 145 +++++++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 37 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 79be0f56..4f54c57a 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -157,10 +157,18 @@ static OPJ_BOOL opj_dwt_decode_partial_tile( opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); +/* Forward transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ -typedef void (*opj_encode_one_row_fnptr_type)(void *, OPJ_INT32, OPJ_INT32, - OPJ_INT32); +typedef void (*opj_encode_and_deinterleave_v_fnptr_type)( + void *array, + void *tmp, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols); +/* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */ typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)( void *row, void *tmp, @@ -169,7 +177,7 @@ typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)( static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - opj_encode_one_row_fnptr_type p_function, + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, opj_encode_and_deinterleave_h_one_row_fnptr_type p_encode_and_deinterleave_h_one_row); @@ -1226,7 +1234,7 @@ typedef struct { OPJ_INT32 * OPJ_RESTRICT tiledp; OPJ_UINT32 min_j; OPJ_UINT32 max_j; - opj_encode_one_row_fnptr_type p_function; + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v; } opj_dwt_encode_v_job_t; static void 
opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) @@ -1236,29 +1244,90 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) (void)tls; job = (opj_dwt_encode_v_job_t*)user_data; - for (j = job->min_j; j < job->max_j; j++) { - OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j; - OPJ_UINT32 k; - for (k = 0; k < job->rh; ++k) { - job->v.mem[k] = aj[k * job->w]; - } - - (*job->p_function)(job->v.mem, job->v.dn, job->v.sn, job->v.cas); - - opj_dwt_deinterleave_v(job->v.mem, aj, job->v.dn, job->v.sn, job->w, - job->v.cas); + for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) { + (*job->p_encode_and_deinterleave_v)(job->tiledp + j, + job->v.mem, + job->rh, + job->v.cas == 0, + job->w, + NB_ELTS_V8); + } + if (j < job->max_j) { + (*job->p_encode_and_deinterleave_v)(job->tiledp + j, + job->v.mem, + job->rh, + job->v.cas == 0, + job->w, + job->max_j - j); } opj_aligned_free(job->v.mem); opj_free(job); } +/* Forward 5-3 transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ +static void opj_dwt_encode_and_deinterleave_v( + void *arrayIn, + void *tmpIn, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn; + OPJ_UINT32 c; + const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); + for (c = 0; c < cols; c++) { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + tmp[k] = array[c + k * stride_width]; + } + + opj_dwt_encode_1(tmp, dn, sn, even ? 0 : 1); + + opj_dwt_deinterleave_v(tmp, array + c, dn, sn, stride_width, even ? 
0 : 1); + } +} + +/* Forward 9-7 transform, for the vertical pass, processing cols columns */ +/* where cols <= NB_ELTS_V8 */ +static void opj_dwt_encode_and_deinterleave_v_real( + void *arrayIn, + void *tmpIn, + OPJ_UINT32 height, + OPJ_BOOL even, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn; + OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn; + OPJ_UINT32 c; + const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1); + const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); + for (c = 0; c < cols; c++) { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + tmp[k] = array[c + k * stride_width]; + } + + opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); + + opj_dwt_deinterleave_v((OPJ_INT32*)tmpIn, + ((OPJ_INT32*)(arrayIn)) + c, + dn, sn, stride_width, even ? 0 : 1); + } +} + + /* */ /* Forward 5-3 wavelet transform in 2-D. */ /* */ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, opj_tcd_tilecomp_t * tilec, - opj_encode_one_row_fnptr_type p_function, + opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, opj_encode_and_deinterleave_h_one_row_fnptr_type p_encode_and_deinterleave_h_one_row) { @@ -1282,11 +1351,11 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); /* overflow check */ - if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) { + if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - l_data_size *= sizeof(OPJ_INT32); + l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32); bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ /* in that case, so do not error out */ @@ -1319,17 +1388,22 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, dn = (OPJ_INT32)(rh 
- rh1); /* Perform vertical pass */ - if (num_threads <= 1 || rw <= 1) { - for (j = 0; j < rw; ++j) { - OPJ_INT32* OPJ_RESTRICT aj = tiledp + j; - OPJ_UINT32 k; - for (k = 0; k < rh; ++k) { - bj[k] = aj[k * w]; - } - - (*p_function)(bj, dn, sn, cas_col); - - opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); + if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { + for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) { + p_encode_and_deinterleave_v(tiledp + j, + bj, + rh, + cas_col == 0, + w, + NB_ELTS_V8); + } + if (j < rw) { + p_encode_and_deinterleave_v(tiledp + j, + bj, + rh, + cas_col == 0, + w, + rw - j); } } else { OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; @@ -1338,7 +1412,7 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, if (rw < num_jobs) { num_jobs = rw; } - step_j = (rw / num_jobs); + step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; for (j = 0; j < num_jobs; j++) { opj_dwt_encode_v_job_t* job; @@ -1363,11 +1437,8 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, job->w = w; job->tiledp = tiledp; job->min_j = j * step_j; - job->max_j = (j + 1U) * step_j; /* this can overflow */ - if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ - job->max_j = rw; - } - job->p_function = p_function; + job->max_j = (j + 1 == num_jobs) ? 
rw : (j + 1) * step_j; + job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v; opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job); } opj_thread_pool_wait_completion(tp, 0); @@ -1440,7 +1511,7 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t * tilec) { return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, - opj_dwt_encode_1, + opj_dwt_encode_and_deinterleave_v, opj_dwt_encode_and_deinterleave_h_one_row); } @@ -1480,7 +1551,7 @@ OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t * tilec) { return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, - opj_dwt_encode_1_real, + opj_dwt_encode_and_deinterleave_v_real, opj_dwt_encode_and_deinterleave_h_one_row_real); } From a38e970fa59abd796c703ec469e578b09f7ffa33 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 22 May 2020 17:50:15 +0200 Subject: [PATCH 22/24] Forward DWT 5-3: major speed up by vectorizing vertical pass `bench_dwt -encode` times goes from 7.9s to 1.7s --- src/lib/openjp2/dwt.c | 347 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 286 insertions(+), 61 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 4f54c57a..c422917c 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -132,11 +132,7 @@ static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a, OPJ_INT32 * OPJ_RESTRICT b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas); -/** -Forward 5-3 wavelet transform in 1-D -*/ -static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas); + /** Forward 9-7 wavelet transform in 1-D */ @@ -332,52 +328,6 @@ static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) #endif /* STANDARD_SLOW_VERSION */ -/* */ -/* Forward 5-3 wavelet transform in 1-D. 
*/ -/* */ -static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas) -{ - OPJ_INT32 i; - OPJ_INT32* a = (OPJ_INT32*)aIn; - - if (!cas) { - if (sn + dn > 1) { - for (i = 0; i < sn - 1; i++) { - OPJ_D(i) -= (OPJ_S(i) + OPJ_S(i + 1)) >> 1; - } - if (((sn + dn) % 2) == 0) { - OPJ_D(i) -= OPJ_S(i); - } - OPJ_S(0) += (OPJ_D(0) + OPJ_D(0) + 2) >> 2; - for (i = 1; i < dn; i++) { - OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; - } - if (((sn + dn) % 2) == 1) { - OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i - 1) + 2) >> 2; - } - } - } else { - if (sn + dn == 1) { - a[0] *= 2; - } else { - OPJ_S(0) -= OPJ_D(0); - for (i = 1; i < sn; i++) { - OPJ_S(i) -= (OPJ_D(i) + OPJ_D(i - 1)) >> 1; - } - if (((sn + dn) % 2) == 1) { - OPJ_S(i) -= OPJ_D(i - 1); - } - for (i = 0; i < dn - 1; i++) { - OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1) + 2) >> 2; - } - if (((sn + dn) % 2) == 0) { - OPJ_D(i) += (OPJ_S(i) + OPJ_S(i) + 2) >> 2; - } - } - } -} - #ifdef STANDARD_SLOW_VERSION /* */ /* Inverse 5-3 wavelet transform in 1-D. */ @@ -1265,6 +1215,76 @@ static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) opj_free(job); } +/** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */ +/* that has a NB_ELTS_V8 interleave factor. 
*/ +static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn, + void *tmpOut, + OPJ_UINT32 height, + OPJ_UINT32 stride_width, + OPJ_UINT32 cols) +{ + const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn; + OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut; + if (cols == NB_ELTS_V8) { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + memcpy(tmp + NB_ELTS_V8 * k, + array + k * stride_width, + NB_ELTS_V8 * sizeof(OPJ_INT32)); + } + } else { + OPJ_UINT32 k; + for (k = 0; k < height; ++k) { + OPJ_UINT32 c; + for (c = 0; c < cols; c++) { + tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width]; + } + for (; c < NB_ELTS_V8; c++) { + tmp[NB_ELTS_V8 * k + c] = 0; + } + } + } +} + +/* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */ +/* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */ +/* columns. */ +static INLINE void opj_dwt_deinterleave_v_cols( + const OPJ_INT32 * OPJ_RESTRICT src, + OPJ_INT32 * OPJ_RESTRICT dst, + OPJ_INT32 dn, + OPJ_INT32 sn, + OPJ_UINT32 stride_width, + OPJ_INT32 cas, + OPJ_UINT32 cols) +{ + OPJ_INT32 i = sn; + OPJ_INT32 * OPJ_RESTRICT l_dest = dst; + const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8; + OPJ_UINT32 c; + + while (i--) { + for (c = 0; c < cols; c++) { + l_dest[c] = l_src[c]; + } + l_dest += stride_width; + l_src += 2 * NB_ELTS_V8; + } + + l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width; + l_src = src + (1 - cas) * NB_ELTS_V8; + + i = dn; + while (i--) { + for (c = 0; c < cols; c++) { + l_dest[c] = l_src[c]; + } + l_dest += stride_width; + l_src += 2 * NB_ELTS_V8; + } +} + + /* Forward 5-3 transform, for the vertical pass, processing cols columns */ /* where cols <= NB_ELTS_V8 */ static void opj_dwt_encode_and_deinterleave_v( @@ -1277,18 +1297,223 @@ static void opj_dwt_encode_and_deinterleave_v( { OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn; OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn; - 
OPJ_UINT32 c; - const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1); - const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); - for (c = 0; c < cols; c++) { - OPJ_UINT32 k; - for (k = 0; k < height; ++k) { - tmp[k] = array[c + k * stride_width]; + const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1; + const OPJ_UINT32 dn = height - sn; + + opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); + +#define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c] +#define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c] + +#ifdef __SSE2__ + if (height == 1) { + if (!even) { + OPJ_UINT32 c; + for (c = 0; c < NB_ELTS_V8; c++) { + tmp[c] *= 2; + } } + } else if (even) { + OPJ_UINT32 c; + OPJ_UINT32 i; + i = 0; + if (i + 1 < sn) { + __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1); + for (; i + 1 < sn; i++) { + __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Di_0 = _mm_sub_epi32(xmm_Di_0, + _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1)); + xmm_Di_1 = _mm_sub_epi32(xmm_Di_1, + _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1)); + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; + xmm_Si_0 = xmm_Sip1_0; + xmm_Si_1 = xmm_Sip1_1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= OPJ_Sc(i); + } + } + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; + } + i = 1; + if (i < dn) { + __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); + const 
__m128i xmm_two = _mm_set1_epi32(2); + for (; i < dn; i++) { + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Si_0 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Si_0 = _mm_add_epi32(xmm_Si_0, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2)); + xmm_Si_1 = _mm_add_epi32(xmm_Si_1, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2)); + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; + xmm_Dim1_0 = xmm_Di_0; + xmm_Dim1_1 = xmm_Di_1; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; + } + } + } else { + OPJ_UINT32 c; + OPJ_UINT32 i; + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) -= OPJ_Dc(0); + } + i = 1; + if (i < sn) { + __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + + (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); + for (; i < sn; i++) { + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Si_0 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Si_1 = *(const __m128i*)(tmp + + (i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Si_0 = _mm_sub_epi32(xmm_Si_0, + _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1)); + xmm_Si_1 = _mm_sub_epi32(xmm_Si_1, + _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1)); + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; + *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; + xmm_Dim1_0 = xmm_Di_0; + xmm_Dim1_1 = xmm_Di_1; + } + } + if (((height) % 2) == 1) { + 
for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= OPJ_Dc(i - 1); + } + } + i = 0; + if (i + 1 < dn) { + __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0)); + __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1)); + const __m128i xmm_two = _mm_set1_epi32(2); + for (; i + 1 < dn; i++) { + __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + + (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); + __m128i xmm_Di_0 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 0); + __m128i xmm_Di_1 = *(const __m128i*)(tmp + + (1 + i * 2) * NB_ELTS_V8 + 4 * 1); + xmm_Di_0 = _mm_add_epi32(xmm_Di_0, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2)); + xmm_Di_1 = _mm_add_epi32(xmm_Di_1, + _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2)); + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; + *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; + xmm_Si_0 = xmm_Sip1_0; + xmm_Si_1 = xmm_Sip1_1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; + } + } + } +#else + if (even) { + OPJ_UINT32 c; + if (height > 1) { + OPJ_UINT32 i; + for (i = 0; i + 1 < sn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) -= OPJ_Sc(i); + } + } + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; + } + for (i = 1; i < dn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; + } + } + } + } else { + OPJ_UINT32 c; + if (height == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(0) *= 2; + } + } else { + OPJ_UINT32 i; + for (c = 0; c < NB_ELTS_V8; c++) { + 
OPJ_Sc(0) -= OPJ_Dc(0); + } + for (i = 1; i < sn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1; + } + } + if (((height) % 2) == 1) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Sc(i) -= OPJ_Dc(i - 1); + } + } + for (i = 0; i + 1 < dn; i++) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2; + } + } + if (((height) % 2) == 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; + } + } + } + } +#endif - opj_dwt_encode_1(tmp, dn, sn, even ? 0 : 1); - - opj_dwt_deinterleave_v(tmp, array + c, dn, sn, stride_width, even ? 0 : 1); + if (cols == NB_ELTS_V8) { + opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, NB_ELTS_V8); + } else { + opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, cols); } } From 1e931fdb3655c64ab60ea5657f79309331a86485 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 22 May 2020 23:57:51 +0200 Subject: [PATCH 23/24] Forward DWT 9-7: major speed up by vectorizing vertical pass `bench_dwt -I -encode` times goes from 8.6s to 2.1s --- src/lib/openjp2/dwt.c | 336 +++++++++++++++++++++++++++++++----------- 1 file changed, 250 insertions(+), 86 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index c422917c..ee9eb5e6 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -125,13 +125,6 @@ static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, OPJ_INT32 * OPJ_RESTRICT b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); -/** -Forward lazy transform (vertical) -*/ -static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a, - OPJ_INT32 * OPJ_RESTRICT b, - OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas); /** Forward 9-7 wavelet transform in 1-D @@ -252,35 +245,6 @@ static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, } } -/* */ -/* Forward lazy transform 
(vertical). */ -/* */ -static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a, - OPJ_INT32 * OPJ_RESTRICT b, - OPJ_INT32 dn, - OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas) -{ - OPJ_INT32 i = sn; - OPJ_INT32 * OPJ_RESTRICT l_dest = b; - const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas; - - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /* b[i*x]=a[2*i+cas]; */ - - l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x; - l_src = a + 1 - cas; - - i = dn; - while (i--) { - *l_dest = *l_src; - l_dest += x; - l_src += 2; - } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ -} - #ifdef STANDARD_SLOW_VERSION /* */ /* Inverse lazy transform (horizontal). */ @@ -989,36 +953,85 @@ static void opj_idwt53_v(const opj_dwt_t *dwt, #endif } +#if 0 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw, - OPJ_UINT32 start, OPJ_UINT32 end, const OPJ_FLOAT32 c) { - OPJ_UINT32 i; - for (i = start; i < end; ++i) { - fw[i * 2] *= c; + OPJ_UINT32 i = 0; + for (; i < end; ++i) { + fw[0] *= c; + fw += 2; } } +#else +static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw, + OPJ_UINT32 iters_c1, + OPJ_UINT32 iters_c2, + const OPJ_FLOAT32 c1, + const OPJ_FLOAT32 c2) +{ + OPJ_UINT32 i = 0; + const OPJ_UINT32 iters_common = opj_uint_min(iters_c1, iters_c2); + assert((((OPJ_SIZE_T)fw) & 0xf) == 0); + assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1); + for (; i + 3 < iters_common; i += 4) { +#ifdef __SSE__ + const __m128 vcst = _mm_set_ps(c2, c1, c2, c1); + *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst); + *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst); +#else + fw[0] *= c1; + fw[1] *= c2; + fw[2] *= c1; + fw[3] *= c2; + fw[4] *= c1; + fw[5] *= c2; + fw[6] *= c1; + fw[7] *= c2; +#endif + fw += 8; + } + for (; i < iters_common; i++) { + fw[0] *= c1; + fw[1] *= c2; + fw += 2; + } + if (i < iters_c1) { + fw[0] *= c1; + } else if (i < iters_c2) { + fw[1] *= c2; + } +} + +#endif + static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, - OPJ_UINT32 start, 
OPJ_UINT32 end, OPJ_UINT32 m, OPJ_FLOAT32 c) { OPJ_UINT32 i; OPJ_UINT32 imax = opj_uint_min(end, m); - if (start > 0) { - fw += 2 * start; - fl = fw - 2; - } - for (i = start; i < imax; ++i) { + if (imax > 0) { fw[-1] += (fl[0] + fw[0]) * c; - fl = fw; fw += 2; + i = 1; + for (; i + 3 < imax; i += 4) { + fw[-1] += (fw[-2] + fw[0]) * c; + fw[1] += (fw[0] + fw[2]) * c; + fw[3] += (fw[2] + fw[4]) * c; + fw[5] += (fw[4] + fw[6]) * c; + fw += 8; + } + for (; i < imax; ++i) { + fw[-1] += (fw[-2] + fw[0]) * c; + fw += 2; + } } if (m < end) { assert(m + 1 == end); - fw[-1] += (2 * fl[0]) * c; + fw[-1] += (2 * fw[-2]) * c; } } @@ -1027,39 +1040,50 @@ static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, { OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn; OPJ_INT32 a, b; + assert(dn + sn > 1); if (cas == 0) { - if (!((dn > 0) || (sn > 1))) { - return; - } a = 0; b = 1; } else { - if (!((sn > 0) || (dn > 1))) { - return; - } a = 1; b = 0; } opj_dwt_encode_step2(w + a, w + b + 1, - 0, (OPJ_UINT32)dn, + (OPJ_UINT32)dn, (OPJ_UINT32)opj_int_min(dn, sn - b), opj_dwt_alpha); opj_dwt_encode_step2(w + b, w + a + 1, - 0, (OPJ_UINT32)sn, + (OPJ_UINT32)sn, (OPJ_UINT32)opj_int_min(sn, dn - a), opj_dwt_beta); opj_dwt_encode_step2(w + a, w + b + 1, - 0, (OPJ_UINT32)dn, + (OPJ_UINT32)dn, (OPJ_UINT32)opj_int_min(dn, sn - b), opj_dwt_gamma); opj_dwt_encode_step2(w + b, w + a + 1, - 0, (OPJ_UINT32)sn, + (OPJ_UINT32)sn, (OPJ_UINT32)opj_int_min(sn, dn - a), opj_dwt_delta); - opj_dwt_encode_step1(w + b, 0, (OPJ_UINT32)dn, +#if 0 + opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn, opj_K); - opj_dwt_encode_step1(w + a, 0, (OPJ_UINT32)sn, + opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn, opj_invK); +#else + if (a == 0) { + opj_dwt_encode_step1_combined(w, + (OPJ_UINT32)sn, + (OPJ_UINT32)dn, + opj_invK, + opj_K); + } else { + opj_dwt_encode_step1_combined(w, + (OPJ_UINT32)dn, + (OPJ_UINT32)sn, + opj_K, + opj_invK); + } +#endif } static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, @@ 
-1143,6 +1167,9 @@ void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn, OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn; const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); + if (width == 1) { + return; + } memcpy(tmp, row, width * sizeof(OPJ_FLOAT32)); opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp, @@ -1258,29 +1285,49 @@ static INLINE void opj_dwt_deinterleave_v_cols( OPJ_INT32 cas, OPJ_UINT32 cols) { + OPJ_INT32 k; OPJ_INT32 i = sn; OPJ_INT32 * OPJ_RESTRICT l_dest = dst; const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8; OPJ_UINT32 c; - while (i--) { - for (c = 0; c < cols; c++) { - l_dest[c] = l_src[c]; + for (k = 0; k < 2; k++) { + while (i--) { + if (cols == NB_ELTS_V8) { + memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32)); + } else { + c = 0; + switch (cols) { + case 7: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 6: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 5: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 4: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 3: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + case 2: + l_dest[c] = l_src[c]; + c++; /* fallthru */ + default: + l_dest[c] = l_src[c]; + break; + } + } + l_dest += stride_width; + l_src += 2 * NB_ELTS_V8; } - l_dest += stride_width; - l_src += 2 * NB_ELTS_V8; - } - l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width; - l_src = src + (1 - cas) * NB_ELTS_V8; - - i = dn; - while (i--) { - for (c = 0; c < cols; c++) { - l_dest[c] = l_src[c]; - } - l_dest += stride_width; - l_src += 2 * NB_ELTS_V8; + l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width; + l_src = src + (1 - cas) * NB_ELTS_V8; + i = dn; } } @@ -1517,6 +1564,84 @@ static void opj_dwt_encode_and_deinterleave_v( } } +static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw, + OPJ_UINT32 end, + const OPJ_FLOAT32 cst) +{ + OPJ_UINT32 i; +#ifdef __SSE__ + 
__m128* vw = (__m128*) fw; + const __m128 vcst = _mm_set1_ps(cst); + for (i = 0; i < end; ++i) { + vw[0] = _mm_mul_ps(vw[0], vcst); + vw[1] = _mm_mul_ps(vw[1], vcst); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + } +#else + OPJ_UINT32 c; + for (i = 0; i < end; ++i) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[i * 2 * NB_ELTS_V8 + c] *= cst; + } + } +#endif +} + +static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, + OPJ_UINT32 end, + OPJ_UINT32 m, + OPJ_FLOAT32 cst) +{ + OPJ_UINT32 i; + OPJ_UINT32 imax = opj_uint_min(end, m); +#ifdef __SSE__ + __m128* vw = (__m128*) fw; + __m128 vcst = _mm_set1_ps(cst); + if (imax > 0) { + __m128* vl = (__m128*) fl; + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst)); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + i = 1; + + for (; i < imax; ++i) { + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst)); + vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); + } + } + if (m < end) { + assert(m + 1 == end); + vcst = _mm_add_ps(vcst, vcst); + vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst)); + vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst)); + } +#else + OPJ_INT32 c; + if (imax > 0) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * + cst; + } + fw += 2 * NB_ELTS_V8; + i = 1; + for (; i < imax; ++i) { + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * + cst; + } + fw += 2 * NB_ELTS_V8; + } + } + if (m < end) { + assert(m + 1 == end); + for (c = 0; c < NB_ELTS_V8; c++) { + fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst; + } + } +#endif +} + /* Forward 9-7 transform, for the vertical pass, processing cols columns */ /* 
where cols <= NB_ELTS_V8 */ static void opj_dwt_encode_and_deinterleave_v_real( @@ -1529,20 +1654,59 @@ static void opj_dwt_encode_and_deinterleave_v_real( { OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn; OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn; - OPJ_UINT32 c; const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1); const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); - for (c = 0; c < cols; c++) { - OPJ_UINT32 k; - for (k = 0; k < height; ++k) { - tmp[k] = array[c + k * stride_width]; - } + OPJ_INT32 a, b; - opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); + if (height == 1) { + return; + } - opj_dwt_deinterleave_v((OPJ_INT32*)tmpIn, - ((OPJ_INT32*)(arrayIn)) + c, - dn, sn, stride_width, even ? 0 : 1); + opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); + + if (even) { + a = 0; + b = 1; + } else { + a = 1; + b = 0; + } + opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, + tmp + (b + 1) * NB_ELTS_V8, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_alpha); + opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, + tmp + (a + 1) * NB_ELTS_V8, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_beta); + opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, + tmp + (b + 1) * NB_ELTS_V8, + (OPJ_UINT32)dn, + (OPJ_UINT32)opj_int_min(dn, sn - b), + opj_dwt_gamma); + opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, + tmp + (a + 1) * NB_ELTS_V8, + (OPJ_UINT32)sn, + (OPJ_UINT32)opj_int_min(sn, dn - a), + opj_dwt_delta); + opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn, + opj_K); + opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn, + opj_invK); + + + if (cols == NB_ELTS_V8) { + opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, + (OPJ_INT32*)array, + (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 0 : 1, NB_ELTS_V8); + } else { + opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, + (OPJ_INT32*)array, + (OPJ_INT32)dn, (OPJ_INT32)sn, + stride_width, even ? 
0 : 1, cols); } } From 1c5627ee7406f84cfb40809b7ac31c63342427df Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 24 May 2020 15:38:21 +0200 Subject: [PATCH 24/24] T1 encoder: speed-up by aggressive inlining and more cache friendly data organization ~ 9% speed improvement seen on 10980x10980 uint16 image, T36JTT_20160914T074612_B02.tif opj_compress time from 17.2s to 15.8s --- src/lib/openjp2/mqc.c | 176 +++++------- src/lib/openjp2/mqc.h | 9 +- src/lib/openjp2/mqc_inl.h | 90 +++++- src/lib/openjp2/t1.c | 576 ++++++++++++++++++++------------------ src/lib/openjp2/t1.h | 1 - 5 files changed, 460 insertions(+), 392 deletions(-) diff --git a/src/lib/openjp2/mqc.c b/src/lib/openjp2/mqc.c index 6299b171..4cbfabd0 100644 --- a/src/lib/openjp2/mqc.c +++ b/src/lib/openjp2/mqc.c @@ -46,27 +46,6 @@ /** @name Local static functions */ /*@{*/ -/** -Output a byte, doing bit-stuffing if necessary. -After a 0xff byte, the next byte must be smaller than 0x90. -@param mqc MQC handle -*/ -static void opj_mqc_byteout(opj_mqc_t *mqc); -/** -Renormalize mqc->a and mqc->c while encoding, so that mqc->a stays between 0x8000 and 0x10000 -@param mqc MQC handle -*/ -static void opj_mqc_renorme(opj_mqc_t *mqc); -/** -Encode the most probable symbol -@param mqc MQC handle -*/ -static void opj_mqc_codemps(opj_mqc_t *mqc); -/** -Encode the most least symbol -@param mqc MQC handle -*/ -static void opj_mqc_codelps(opj_mqc_t *mqc); /** Fill mqc->c with 1's for flushing @param mqc MQC handle @@ -182,80 +161,6 @@ static const opj_mqc_state_t mqc_states[47 * 2] = { ========================================================== */ -static void opj_mqc_byteout(opj_mqc_t *mqc) -{ - /* bp is initialized to start - 1 in opj_mqc_init_enc() */ - /* but this is safe, see opj_tcd_code_block_enc_allocate_data() */ - assert(mqc->bp >= mqc->start - 1); - if (*mqc->bp == 0xff) { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - if ((mqc->c & 0x8000000) == 0) { - 
mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } else { - (*mqc->bp)++; - if (*mqc->bp == 0xff) { - mqc->c &= 0x7ffffff; - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); - mqc->c &= 0xfffff; - mqc->ct = 7; - } else { - mqc->bp++; - *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); - mqc->c &= 0x7ffff; - mqc->ct = 8; - } - } - } -} - -static void opj_mqc_renorme(opj_mqc_t *mqc) -{ - do { - mqc->a <<= 1; - mqc->c <<= 1; - mqc->ct--; - if (mqc->ct == 0) { - opj_mqc_byteout(mqc); - } - } while ((mqc->a & 0x8000) == 0); -} - -static void opj_mqc_codemps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if ((mqc->a & 0x8000) == 0) { - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->a = (*mqc->curctx)->qeval; - } else { - mqc->c += (*mqc->curctx)->qeval; - } - *mqc->curctx = (*mqc->curctx)->nmps; - opj_mqc_renorme(mqc); - } else { - mqc->c += (*mqc->curctx)->qeval; - } -} - -static void opj_mqc_codelps(opj_mqc_t *mqc) -{ - mqc->a -= (*mqc->curctx)->qeval; - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->c += (*mqc->curctx)->qeval; - } else { - mqc->a = (*mqc->curctx)->qeval; - } - *mqc->curctx = (*mqc->curctx)->nlps; - opj_mqc_renorme(mqc); -} - static void opj_mqc_setbits(opj_mqc_t *mqc) { OPJ_UINT32 tempc = mqc->c + mqc->a; @@ -303,14 +208,6 @@ void opj_mqc_init_enc(opj_mqc_t *mqc, OPJ_BYTE *bp) mqc->end_of_byte_stream_counter = 0; } -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d) -{ - if ((*mqc->curctx)->mps == d) { - opj_mqc_codemps(mqc); - } else { - opj_mqc_codelps(mqc); - } -} void opj_mqc_flush(opj_mqc_t *mqc) { @@ -329,8 +226,6 @@ void opj_mqc_flush(opj_mqc_t *mqc) } } -#define BYPASS_CT_INIT 0xDEADBEEF - void opj_mqc_bypass_init_enc(opj_mqc_t *mqc) { /* This function is normally called after at least one opj_mqc_flush() */ @@ -475,6 +370,43 @@ void opj_mqc_erterm_enc(opj_mqc_t *mqc) } } +static INLINE void opj_mqc_renorme(opj_mqc_t *mqc) +{ + opj_mqc_renorme_macro(mqc, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode the most 
probable symbol +@param mqc MQC handle +*/ +static INLINE void opj_mqc_codemps(opj_mqc_t *mqc) +{ + opj_mqc_codemps_macro(mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode the least probable symbol +@param mqc MQC handle +*/ +static INLINE void opj_mqc_codelps(opj_mqc_t *mqc) +{ + opj_mqc_codelps_macro(mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct); +} + +/** +Encode a symbol using the MQ-coder +@param mqc MQC handle +@param d The symbol to be encoded (0 or 1) +*/ +static INLINE void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d) +{ + if ((*mqc->curctx)->mps == d) { + opj_mqc_codemps(mqc); + } else { + opj_mqc_codelps(mqc); + } +} + void opj_mqc_segmark_enc(opj_mqc_t *mqc) { OPJ_UINT32 i; @@ -557,4 +489,36 @@ void opj_mqc_setstate(opj_mqc_t *mqc, OPJ_UINT32 ctxno, OPJ_UINT32 msb, mqc->ctxs[ctxno] = &mqc_states[msb + (OPJ_UINT32)(prob << 1)]; } - +void opj_mqc_byteout(opj_mqc_t *mqc) +{ + /* bp is initialized to start - 1 in opj_mqc_init_enc() */ + /* but this is safe, see opj_tcd_code_block_enc_allocate_data() */ + assert(mqc->bp >= mqc->start - 1); + if (*mqc->bp == 0xff) { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); + mqc->c &= 0xfffff; + mqc->ct = 7; + } else { + if ((mqc->c & 0x8000000) == 0) { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); + mqc->c &= 0x7ffff; + mqc->ct = 8; + } else { + (*mqc->bp)++; + if (*mqc->bp == 0xff) { + mqc->c &= 0x7ffffff; + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 20); + mqc->c &= 0xfffff; + mqc->ct = 7; + } else { + mqc->bp++; + *mqc->bp = (OPJ_BYTE)(mqc->c >> 19); + mqc->c &= 0x7ffff; + mqc->ct = 8; + } + } + } +} \ No newline at end of file diff --git a/src/lib/openjp2/mqc.h b/src/lib/openjp2/mqc.h index 69a2a79d..9850fed0 100644 --- a/src/lib/openjp2/mqc.h +++ b/src/lib/openjp2/mqc.h @@ -96,6 +96,8 @@ typedef struct opj_mqc { OPJ_BYTE backup[OPJ_COMMON_CBLK_DATA_EXTRA]; } opj_mqc_t; +#define BYPASS_CT_INIT 0xDEADBEEF + #include "mqc_inl.h" /** @name Exported functions */ @@ -135,12 +137,7 @@ Set the current
context used for coding/decoding @param ctxno Number that identifies the context */ #define opj_mqc_setcurctx(mqc, ctxno) (mqc)->curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] -/** -Encode a symbol using the MQ-coder -@param mqc MQC handle -@param d The symbol to be encoded (0 or 1) -*/ -void opj_mqc_encode(opj_mqc_t *mqc, OPJ_UINT32 d); + /** Flush the encoder, so that all remaining data is written @param mqc MQC handle diff --git a/src/lib/openjp2/mqc_inl.h b/src/lib/openjp2/mqc_inl.h index 310a3287..0031b94b 100644 --- a/src/lib/openjp2/mqc_inl.h +++ b/src/lib/openjp2/mqc_inl.h @@ -156,13 +156,13 @@ static INLINE OPJ_UINT32 opj_mqc_raw_decode(opj_mqc_t *mqc) } \ } -#define DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ +#define DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct) \ register const opj_mqc_state_t **curctx = mqc->curctx; \ register OPJ_UINT32 c = mqc->c; \ register OPJ_UINT32 a = mqc->a; \ register OPJ_UINT32 ct = mqc->ct -#define UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct) \ +#define UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct) \ mqc->curctx = curctx; \ mqc->c = c; \ mqc->a = a; \ @@ -193,4 +193,90 @@ Decode a symbol #define opj_mqc_decode(d, mqc) \ opj_mqc_decode_macro(d, mqc, mqc->curctx, mqc->a, mqc->c, mqc->ct) +/** +Output a byte, doing bit-stuffing if necessary. +After a 0xff byte, the next byte must be smaller than 0x90. 
+@param mqc MQC handle +*/ +void opj_mqc_byteout(opj_mqc_t *mqc); + +/** +Renormalize mqc->a and mqc->c while encoding, so that mqc->a stays between 0x8000 and 0x10000 +@param mqc MQC handle +@param a_ value of mqc->a +@param c_ value of mqc->c +@param ct_ value of mqc->ct +*/ +#define opj_mqc_renorme_macro(mqc, a_, c_, ct_) \ +{ \ + do { \ + a_ <<= 1; \ + c_ <<= 1; \ + ct_--; \ + if (ct_ == 0) { \ + mqc->c = c_; \ + opj_mqc_byteout(mqc); \ + c_ = mqc->c; \ + ct_ = mqc->ct; \ + } \ + } while( (a_ & 0x8000) == 0); \ +} + +#define opj_mqc_codemps_macro(mqc, curctx, a, c, ct) \ +{ \ + a -= (*curctx)->qeval; \ + if ((a & 0x8000) == 0) { \ + if (a < (*curctx)->qeval) { \ + a = (*curctx)->qeval; \ + } else { \ + c += (*curctx)->qeval; \ + } \ + *curctx = (*curctx)->nmps; \ + opj_mqc_renorme_macro(mqc, a, c, ct); \ + } else { \ + c += (*curctx)->qeval; \ + } \ +} + +#define opj_mqc_codelps_macro(mqc, curctx, a, c, ct) \ +{ \ + a -= (*curctx)->qeval; \ + if (a < (*curctx)->qeval) { \ + c += (*curctx)->qeval; \ + } else { \ + a = (*curctx)->qeval; \ + } \ + *curctx = (*curctx)->nlps; \ + opj_mqc_renorme_macro(mqc, a, c, ct); \ +} + +#define opj_mqc_encode_macro(mqc, curctx, a, c, ct, d) \ +{ \ + if ((*curctx)->mps == (d)) { \ + opj_mqc_codemps_macro(mqc, curctx, a, c, ct); \ + } else { \ + opj_mqc_codelps_macro(mqc, curctx, a, c, ct); \ + } \ +} + + +#define opj_mqc_bypass_enc_macro(mqc, c, ct, d) \ +{\ + if (ct == BYPASS_CT_INIT) {\ + ct = 8;\ + }\ + ct--;\ + c = c + ((d) << ct);\ + if (ct == 0) {\ + *mqc->bp = (OPJ_BYTE)c;\ + ct = 8;\ + /* If the previous byte was 0xff, make sure that the next msb is 0 */ \ + if (*mqc->bp == 0xff) {\ + ct = 7;\ + }\ + mqc->bp++;\ + c = 0;\ + }\ +} + #endif /* OPJ_MQC_INL_H */ diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 937f420a..92030b21 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -61,6 +61,13 @@ #define opj_t1_setcurctx(curctx, ctxno) curctx = &(mqc)->ctxs[(OPJ_UINT32)(ctxno)] +/* Macros to deal
with signed integer with just MSB bit set for + * negative values (smr = signed magnitude representation) */ +#define opj_smr_abs(x) (((OPJ_UINT32)(x)) & 0x7FFFFFFFU) +#define opj_smr_sign(x) (((OPJ_UINT32)(x)) >> 31) +#define opj_to_smr(x) ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-x) | 0x80000000U)) + + /** @name Local static functions */ /*@{*/ @@ -329,61 +336,53 @@ static INLINE void opj_t1_update_flags(opj_flag_t *flagsp, OPJ_UINT32 ci, /** Encode significant pass */ -static INLINE void opj_t1_enc_sigpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci, - OPJ_UINT32 vsc) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const flags = *flagsp; - - if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && - (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { - OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); - v = (opj_int_abs(*datap) & one) ? 1 : 0; -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - if (v) { - OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); - v = *datap < 0 ? 
1U : 0U; - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - OPJ_UINT32 spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_SIG - fprintf(stderr, " spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - } - opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); - } - *flagsp |= T1_PI_THIS << (ci * 3U); - } +#define opj_t1_enc_sigpass_step_macro(mqc, curctx, a, c, ct, flagspIn, datapIn, bpno, one, nmsedec, type, ciIn, vscIn) \ +{ \ + OPJ_UINT32 v; \ + const OPJ_UINT32 ci = (ciIn); \ + const OPJ_UINT32 vsc = (vscIn); \ + const OPJ_INT32* l_datap = (datapIn); \ + opj_flag_t* flagsp = (flagspIn); \ + OPJ_UINT32 const flags = *flagsp; \ + if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U))) == 0U && \ + (flags & (T1_SIGMA_NEIGHBOURS << (ci * 3U))) != 0U) { \ + OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); \ + v = (opj_smr_abs(*l_datap) & (OPJ_UINT32)one) ? 
1 : 0; \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " ctxt1=%d\n", ctxt1); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt1); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + } \ + if (v) { \ + OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ + *flagsp, \ + flagsp[-1], flagsp[1], \ + ci); \ + OPJ_UINT32 ctxt2 = opj_t1_getctxno_sc(lu); \ + v = opj_smr_sign(*l_datap); \ + *nmsedec += opj_t1_getnmsedec_sig(opj_smr_abs(*l_datap), \ + (OPJ_UINT32)bpno); \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " ctxt2=%d\n", ctxt2); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt2); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + OPJ_UINT32 spb = opj_t1_getspb(lu); \ +/* #ifdef DEBUG_ENC_SIG */ \ +/* fprintf(stderr, " spb=%d\n", spb); */ \ +/* #endif */ \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v ^ spb); \ + } \ + opj_t1_update_flags(flagsp, ci, v, t1->w + 2, vsc); \ + } \ + *flagsp |= T1_PI_THIS << (ci * 3U); \ + } \ } static INLINE void opj_t1_dec_sigpass_step_raw( @@ -464,63 +463,64 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, OPJ_INT32 const one = 1 << (bpno + T1_NMSEDEC_FRACBITS); opj_flag_t* f = &T1_FLAGS(0, 0); OPJ_UINT32 const extra = 2; + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; *nmsedec = 0; #ifdef DEBUG_ENC_SIG fprintf(stderr, "enc_sigpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { + const OPJ_UINT32 w = t1->w; #ifdef DEBUG_ENC_SIG fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < w; ++i, ++f, datap += 4) { #ifdef DEBUG_ENC_SIG fprintf(stderr, " i=%d\n", i); #endif if (*f == 0U) { /* Nothing to do for any of the 4 data points */ - f++; continue; } - 
opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 0) * t1->data_stride) + i], + &datap[0], bpno, one, nmsedec, type, 0, cblksty & J2K_CCP_CBLKSTY_VSC); - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 1) * t1->data_stride) + i], + &datap[1], bpno, one, nmsedec, type, 1, 0); - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 2) * t1->data_stride) + i], + &datap[2], bpno, one, nmsedec, type, 2, 0); - opj_t1_enc_sigpass_step( - t1, + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[((k + 3) * t1->data_stride) + i], + &datap[3], bpno, one, nmsedec, type, 3, 0); - ++f; } - f += extra; } if (k < t1->h) { @@ -528,20 +528,20 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, #ifdef DEBUG_ENC_SIG fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, ++f) { #ifdef DEBUG_ENC_SIG fprintf(stderr, " i=%d\n", i); #endif if (*f == 0U) { /* Nothing to do for any of the 4 data points */ - f++; + datap += (t1->h - k); continue; } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_sigpass_step( - t1, + for (j = k; j < t1->h; ++j, ++datap) { + opj_t1_enc_sigpass_step_macro( + mqc, curctx, a, c, ct, f, - &t1->data[(j * t1->data_stride) + i], + &datap[0], bpno, one, nmsedec, @@ -549,9 +549,10 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, j - k, (j == k && (cblksty & J2K_CCP_CBLKSTY_VSC) != 0)); } - ++f; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } static void opj_t1_dec_sigpass_raw( @@ -626,7 +627,7 @@ static void opj_t1_dec_sigpass_raw( register opj_flag_t *flagsp = &t1->flags[(flags_stride) + 1]; \ const OPJ_UINT32 l_w = w; \ opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ half = one >> 1; \ @@ -651,7 +652,7 
@@ static void opj_t1_dec_sigpass_raw( } \ } \ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ for (j = 0; j < h - k; ++j) { \ @@ -715,38 +716,27 @@ static void opj_t1_dec_sigpass_mqc( /** Encode refinement pass step */ -static INLINE void opj_t1_enc_refpass_step(opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - OPJ_INT32 *nmsedec, - OPJ_BYTE type, - OPJ_UINT32 ci) -{ - OPJ_UINT32 v; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - OPJ_UINT32 const shift_flags = - (*flagsp >> (ci * 3U)); - - if ((shift_flags & (T1_SIGMA_THIS | T1_PI_THIS)) == T1_SIGMA_THIS) { - OPJ_UINT32 ctxt = opj_t1_getctxno_mag(shift_flags); - *nmsedec += opj_t1_getnmsedec_ref((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - v = (opj_int_abs(*datap) & one) ? 1 : 0; -#ifdef DEBUG_ENC_REF - fprintf(stderr, " ctxt=%d\n", ctxt); -#endif - opj_mqc_setcurctx(mqc, ctxt); - if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ - opj_mqc_bypass_enc(mqc, v); - } else { - opj_mqc_encode(mqc, v); - } - *flagsp |= T1_MU_THIS << (ci * 3U); - } +#define opj_t1_enc_refpass_step_macro(mqc, curctx, a, c, ct, flags, flagsUpdated, datap, bpno, one, nmsedec, type, ci) \ +{\ + OPJ_UINT32 v; \ + if ((flags & ((T1_SIGMA_THIS | T1_PI_THIS) << ((ci) * 3U))) == (T1_SIGMA_THIS << ((ci) * 3U))) { \ + const OPJ_UINT32 shift_flags = (flags >> ((ci) * 3U)); \ + OPJ_UINT32 ctxt = opj_t1_getctxno_mag(shift_flags); \ + OPJ_UINT32 abs_data = opj_smr_abs(*datap); \ + *nmsedec += opj_t1_getnmsedec_ref(abs_data, \ + (OPJ_UINT32)bpno); \ + v = ((OPJ_INT32)abs_data & one) ? 
1 : 0; \ +/* #ifdef DEBUG_ENC_REF */ \ +/* fprintf(stderr, " ctxt=%d\n", ctxt); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt); \ + if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ \ + opj_mqc_bypass_enc_macro(mqc, c, ct, v); \ + } else { \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + } \ + flagsUpdated |= T1_MU_THIS << ((ci) * 3U); \ + } \ } @@ -807,100 +797,104 @@ static void opj_t1_enc_refpass( const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); opj_flag_t* f = &T1_FLAGS(0, 0); const OPJ_UINT32 extra = 2U; + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; *nmsedec = 0; #ifdef DEBUG_ENC_REF fprintf(stderr, "enc_refpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { #ifdef DEBUG_ENC_REF fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++, datap += 4) { + const OPJ_UINT32 flags = *f; + OPJ_UINT32 flagsUpdated = flags; #ifdef DEBUG_ENC_REF fprintf(stderr, " i=%d\n", i); #endif - if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { + if ((flags & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { /* none significant */ - f++; continue; } - if ((*f & (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) == + if ((flags & (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) == (T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3)) { /* all processed by sigpass */ - f++; continue; } - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 0) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[0], bpno, one, nmsedec, type, 0); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 1) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[1], bpno, one, nmsedec, type, 1); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 2) * 
t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[2], bpno, one, nmsedec, type, 2); - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[((k + 3) * t1->data_stride) + i], + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + flags, flagsUpdated, + &datap[3], bpno, one, nmsedec, type, 3); - ++f; + *f = flagsUpdated; } - f += extra; } if (k < t1->h) { OPJ_UINT32 j; + const OPJ_UINT32 remaining_lines = t1->h - k; #ifdef DEBUG_ENC_REF fprintf(stderr, " k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, ++f) { #ifdef DEBUG_ENC_REF fprintf(stderr, " i=%d\n", i); #endif if ((*f & (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13)) == 0) { /* none significant */ - f++; + datap += remaining_lines; continue; } - for (j = k; j < t1->h; ++j) { - opj_t1_enc_refpass_step( - t1, - f, - &t1->data[(j * t1->data_stride) + i], + for (j = 0; j < remaining_lines; ++j, datap ++) { + opj_t1_enc_refpass_step_macro( + mqc, curctx, a, c, ct, + *f, *f, + &datap[0], bpno, one, nmsedec, type, - j - k); + j); } - ++f; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } @@ -968,7 +962,7 @@ static void opj_t1_dec_refpass_raw( register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ const OPJ_UINT32 l_w = w; \ opj_mqc_t* mqc = &(t1->mqc); \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ poshalf = one >> 1; \ @@ -992,7 +986,7 @@ static void opj_t1_dec_refpass_raw( } \ } \ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++data, ++flagsp) { \ for (j = 0; j < h - k; ++j) { \ @@ -1030,86 +1024,71 @@ static void opj_t1_dec_refpass_mqc( /** Encode clean-up pass step */ -static void opj_t1_enc_clnpass_step( - opj_t1_t *t1, - opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 bpno, - OPJ_INT32 one, - 
OPJ_INT32 *nmsedec, - OPJ_UINT32 agg, - OPJ_UINT32 runlen, - OPJ_UINT32 lim, - OPJ_UINT32 cblksty) -{ - OPJ_UINT32 v; - OPJ_UINT32 ci; - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ - - const OPJ_UINT32 check = (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13 | - T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - - if ((*flagsp & check) == check) { - if (runlen == 0) { - *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 1) { - *flagsp &= ~(T1_PI_1 | T1_PI_2 | T1_PI_3); - } else if (runlen == 2) { - *flagsp &= ~(T1_PI_2 | T1_PI_3); - } else if (runlen == 3) { - *flagsp &= ~(T1_PI_3); - } - return; - } - - for (ci = runlen; ci < lim; ++ci) { - OPJ_UINT32 vsc; - opj_flag_t flags; - OPJ_UINT32 ctxt1; - - flags = *flagsp; - - if ((agg != 0) && (ci == runlen)) { - goto LABEL_PARTIAL; - } - - if (!(flags & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) { - ctxt1 = opj_t1_getctxno_zc(mqc, flags >> (ci * 3U)); -#ifdef DEBUG_ENC_CLN - printf(" ctxt1=%d\n", ctxt1); -#endif - opj_mqc_setcurctx(mqc, ctxt1); - v = (opj_int_abs(*datap) & one) ? 1 : 0; - opj_mqc_encode(mqc, v); - if (v) { - OPJ_UINT32 ctxt2, spb; - OPJ_UINT32 lu; -LABEL_PARTIAL: - lu = opj_t1_getctxtno_sc_or_spb_index( - *flagsp, - flagsp[-1], flagsp[1], - ci); - *nmsedec += opj_t1_getnmsedec_sig((OPJ_UINT32)opj_int_abs(*datap), - (OPJ_UINT32)bpno); - ctxt2 = opj_t1_getctxno_sc(lu); -#ifdef DEBUG_ENC_CLN - printf(" ctxt2=%d\n", ctxt2); -#endif - opj_mqc_setcurctx(mqc, ctxt2); - - v = *datap < 0 ? 1U : 0U; - spb = opj_t1_getspb(lu); -#ifdef DEBUG_ENC_CLN - printf(" spb=%d\n", spb); -#endif - opj_mqc_encode(mqc, v ^ spb); - vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (ci == 0)) ? 
1 : 0; - opj_t1_update_flags(flagsp, ci, v, t1->w + 2U, vsc); - } - } - *flagsp &= ~(T1_PI_THIS << (3U * ci)); - datap += t1->data_stride; - } +#define opj_t1_enc_clnpass_step_macro(mqc, curctx, a, c, ct, flagspIn, datapIn, bpno, one, nmsedec, agg, runlen, lim, cblksty) \ +{ \ + OPJ_UINT32 v; \ + OPJ_UINT32 ci; \ + opj_flag_t* const flagsp = (flagspIn); \ + const OPJ_INT32* l_datap = (datapIn); \ + const OPJ_UINT32 check = (T1_SIGMA_4 | T1_SIGMA_7 | T1_SIGMA_10 | T1_SIGMA_13 | \ + T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ + \ + if ((*flagsp & check) == check) { \ + if (runlen == 0) { \ + *flagsp &= ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ + } else if (runlen == 1) { \ + *flagsp &= ~(T1_PI_1 | T1_PI_2 | T1_PI_3); \ + } else if (runlen == 2) { \ + *flagsp &= ~(T1_PI_2 | T1_PI_3); \ + } else if (runlen == 3) { \ + *flagsp &= ~(T1_PI_3); \ + } \ + } \ + else \ + for (ci = runlen; ci < lim; ++ci) { \ + OPJ_BOOL goto_PARTIAL = OPJ_FALSE; \ + if ((agg != 0) && (ci == runlen)) { \ + goto_PARTIAL = OPJ_TRUE; \ + } \ + else if (!(*flagsp & ((T1_SIGMA_THIS | T1_PI_THIS) << (ci * 3U)))) { \ + OPJ_UINT32 ctxt1 = opj_t1_getctxno_zc(mqc, *flagsp >> (ci * 3U)); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" ctxt1=%d\n", ctxt1); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt1); \ + v = (opj_smr_abs(*l_datap) & (OPJ_UINT32)one) ? 
1 : 0; \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v); \ + if (v) { \ + goto_PARTIAL = OPJ_TRUE; \ + } \ + } \ + if( goto_PARTIAL ) { \ + OPJ_UINT32 vsc; \ + OPJ_UINT32 ctxt2, spb; \ + OPJ_UINT32 lu = opj_t1_getctxtno_sc_or_spb_index( \ + *flagsp, \ + flagsp[-1], flagsp[1], \ + ci); \ + *nmsedec += opj_t1_getnmsedec_sig(opj_smr_abs(*l_datap), \ + (OPJ_UINT32)bpno); \ + ctxt2 = opj_t1_getctxno_sc(lu); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" ctxt2=%d\n", ctxt2); */ \ +/* #endif */ \ + opj_t1_setcurctx(curctx, ctxt2); \ + \ + v = opj_smr_sign(*l_datap); \ + spb = opj_t1_getspb(lu); \ +/* #ifdef DEBUG_ENC_CLN */ \ +/* printf(" spb=%d\n", spb); */\ +/* #endif */ \ + opj_mqc_encode_macro(mqc, curctx, a, c, ct, v ^ spb); \ + vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (ci == 0)) ? 1 : 0; \ + opj_t1_update_flags(flagsp, ci, v, t1->w + 2U, vsc); \ + } \ + *flagsp &= ~(T1_PI_THIS << (3U * ci)); \ + l_datap ++; \ + } \ } #define opj_t1_dec_clnpass_step_macro(check_flags, partial, \ @@ -1165,47 +1144,50 @@ static void opj_t1_enc_clnpass( { OPJ_UINT32 i, k; const OPJ_INT32 one = 1 << (bpno + T1_NMSEDEC_FRACBITS); - OPJ_UINT32 agg, runlen; - - opj_mqc_t *mqc = &(t1->mqc); /* MQC component */ + opj_mqc_t* mqc = &(t1->mqc); + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); + const OPJ_INT32* datap = t1->data; + opj_flag_t *f = &T1_FLAGS(0, 0); + const OPJ_UINT32 extra = 2U; *nmsedec = 0; #ifdef DEBUG_ENC_CLN printf("enc_clnpass: bpno=%d\n", bpno); #endif - for (k = 0; k < (t1->h & ~3U); k += 4) { + for (k = 0; k < (t1->h & ~3U); k += 4, f += extra) { #ifdef DEBUG_ENC_CLN printf(" k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++) { + OPJ_UINT32 agg, runlen; #ifdef DEBUG_ENC_CLN printf(" i=%d\n", i); #endif - agg = !(T1_FLAGS(i, k)); + agg = !*f; #ifdef DEBUG_ENC_CLN printf(" agg=%d\n", agg); #endif if (agg) { - for (runlen = 0; runlen < 4; ++runlen) { - if (opj_int_abs(t1->data[((k + runlen)*t1->data_stride) + i]) & one) { + for (runlen = 
0; runlen < 4; ++runlen, ++datap) { + if (opj_smr_abs(*datap) & (OPJ_UINT32)one) { break; } } - opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); - opj_mqc_encode(mqc, runlen != 4); + opj_t1_setcurctx(curctx, T1_CTXNO_AGG); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen != 4); if (runlen == 4) { continue; } - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - opj_mqc_encode(mqc, runlen >> 1); - opj_mqc_encode(mqc, runlen & 1); + opj_t1_setcurctx(curctx, T1_CTXNO_UNI); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen >> 1); + opj_mqc_encode_macro(mqc, curctx, a, c, ct, runlen & 1); } else { runlen = 0; } - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], + opj_t1_enc_clnpass_step_macro( + mqc, curctx, a, c, ct, + f, + datap, bpno, one, nmsedec, @@ -1213,23 +1195,24 @@ static void opj_t1_enc_clnpass( runlen, 4U, cblksty); + datap += 4 - runlen; } } if (k < t1->h) { - agg = 0; - runlen = 0; + const OPJ_UINT32 agg = 0; + const OPJ_UINT32 runlen = 0; #ifdef DEBUG_ENC_CLN printf(" k=%d\n", k); #endif - for (i = 0; i < t1->w; ++i) { + for (i = 0; i < t1->w; ++i, f++) { #ifdef DEBUG_ENC_CLN printf(" i=%d\n", i); printf(" agg=%d\n", agg); #endif - opj_t1_enc_clnpass_step( - t1, - &T1_FLAGS(i, k), - &t1->data[((k + runlen) * t1->data_stride) + i], + opj_t1_enc_clnpass_step_macro( + mqc, curctx, a, c, ct, + f, + datap, bpno, one, nmsedec, @@ -1237,8 +1220,11 @@ static void opj_t1_enc_clnpass( runlen, t1->h - k, cblksty); + datap += t1->h - k; } } + + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); } #define opj_t1_dec_clnpass_internal(t1, bpno, vsc, w, h, flags_stride) \ @@ -1250,7 +1236,7 @@ static void opj_t1_enc_clnpass( opj_mqc_t* mqc = &(t1->mqc); \ register OPJ_INT32 *data = t1->data; \ register opj_flag_t *flagsp = &t1->flags[flags_stride + 1]; \ - DOWNLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + DOWNLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ register OPJ_UINT32 v; \ one = 1 << bpno; \ half = one >> 1; \ @@ -1319,7 +1305,7 @@ 
static void opj_t1_enc_clnpass( *flagsp = flags & ~(T1_PI_0 | T1_PI_1 | T1_PI_2 | T1_PI_3); \ } \ } \ - UPLOAD_MQC_VARIABLES(mqc, curctx, c, a, ct); \ + UPLOAD_MQC_VARIABLES(mqc, curctx, a, c, ct); \ if( k < h ) { \ for (i = 0; i < l_w; ++i, ++flagsp, ++data) { \ for (j = 0; j < h - k; ++j) { \ @@ -1427,7 +1413,7 @@ static OPJ_FLOAT64 opj_t1_getwmsedec( w2 = opj_dwt_getnorm(level, orient); } else { /* if (qmfbid == 0) */ const OPJ_INT32 log2_gain = (orient == 0) ? 0 : - (orient == 3) ? 2 : 1; + (orient == 3) ? 2 : 1; w2 = opj_dwt_getnorm_real(level, orient); /* Not sure this is right. But preserves past behaviour */ stepsize /= (1 << log2_gain); @@ -1454,7 +1440,7 @@ static OPJ_BOOL opj_t1_allocate_buffers( assert(w * h <= 4096); /* encoder uses tile buffer, so no need to allocate */ - if (!t1->encoder) { + { OPJ_UINT32 datasize = w * h; if (datasize > t1->datasize) { @@ -1564,8 +1550,7 @@ void opj_t1_destroy(opj_t1_t *p_t1) return; } - /* encoder uses tile buffer, so no need to free */ - if (!p_t1->encoder && p_t1->data) { + if (p_t1->data) { opj_aligned_free(p_t1->data); p_t1->data = 00; } @@ -2140,8 +2125,7 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) OPJ_INT32* OPJ_RESTRICT tiledp; OPJ_UINT32 cblk_w; OPJ_UINT32 cblk_h; - OPJ_UINT32 i, j, tileLineAdvance; - OPJ_SIZE_T tileIndex = 0; + OPJ_UINT32 i, j; OPJ_INT32 x = cblk->x0 - band->x0; OPJ_INT32 y = cblk->y0 - band->y0; @@ -2177,11 +2161,9 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) cblk_w = t1->w; cblk_h = t1->h; - tileLineAdvance = tile_w - cblk_w; tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x]; - t1->data = tiledp; - t1->data_stride = tile_w; + if (tccp->qmfbid == 1) { /* Do multiplication on unsigned type, even if the * underlying type is signed, to avoid potential @@ -2192,22 +2174,52 @@ static void opj_t1_clbl_encode_processor(void* user_data, opj_tls_t* tls) * Fixes https://github.com/uclouvain/openjpeg/issues/1053 */ 
OPJ_UINT32* OPJ_RESTRICT tiledp_u = (OPJ_UINT32*) tiledp; - for (j = 0; j < cblk_h; ++j) { + OPJ_UINT32* OPJ_RESTRICT t1data = (OPJ_UINT32*) t1->data; + /* Change from "natural" order to "zigzag" order of T1 passes */ + for (j = 0; j < (cblk_h & ~3U); j += 4) { for (i = 0; i < cblk_w; ++i) { - tiledp_u[tileIndex] <<= T1_NMSEDEC_FRACBITS; - tileIndex++; + t1data[0] = tiledp_u[(j + 0) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[1] = tiledp_u[(j + 1) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[2] = tiledp_u[(j + 2) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data[3] = tiledp_u[(j + 3) * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data += 4; + } + } + if (j < cblk_h) { + for (i = 0; i < cblk_w; ++i) { + OPJ_UINT32 k; + for (k = j; k < cblk_h; k++) { + t1data[0] = tiledp_u[k * tile_w + i] << T1_NMSEDEC_FRACBITS; + t1data ++; + } } - tileIndex += tileLineAdvance; } } else { /* if (tccp->qmfbid == 0) */ - for (j = 0; j < cblk_h; ++j) { + OPJ_FLOAT32* OPJ_RESTRICT tiledp_f = (OPJ_FLOAT32*) tiledp; + OPJ_INT32* OPJ_RESTRICT t1data = t1->data; + /* Change from "natural" order to "zigzag" order of T1 passes */ + for (j = 0; j < (cblk_h & ~3U); j += 4) { for (i = 0; i < cblk_w; ++i) { - OPJ_FLOAT32 tmp = ((OPJ_FLOAT32*)tiledp)[tileIndex]; - tiledp[tileIndex] = (OPJ_INT32)opj_lrintf((tmp / band->stepsize) * - (1 << T1_NMSEDEC_FRACBITS)); - tileIndex++; + t1data[0] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 0) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[1] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 1) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[2] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 2) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data[3] = (OPJ_INT32)opj_lrintf((tiledp_f[(j + 3) * tile_w + i] / + band->stepsize) * (1 << T1_NMSEDEC_FRACBITS)); + t1data += 4; + } + } + if (j < cblk_h) { + for (i = 0; i < cblk_w; ++i) { + OPJ_UINT32 k; + for (k = j; k < cblk_h; k++) { + t1data[0] = 
(OPJ_INT32)opj_lrintf((tiledp_f[k * tile_w + i] / band->stepsize) + * (1 << T1_NMSEDEC_FRACBITS)); + t1data ++; + } } - tileIndex += tileLineAdvance; } } @@ -2363,6 +2375,7 @@ static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1, OPJ_UINT32 i, j; OPJ_BYTE type = T1_TYPE_MQ; OPJ_FLOAT64 tempwmsedec; + OPJ_INT32* datap; #ifdef EXTRA_DEBUG printf("encode_cblk(x=%d,y=%d,x1=%d,y1=%d,orient=%d,compno=%d,level=%d\n", @@ -2372,10 +2385,19 @@ static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1, mqc->lut_ctxno_zc_orient = lut_ctxno_zc + (orient << 9); max = 0; - for (i = 0; i < t1->w; ++i) { - for (j = 0; j < t1->h; ++j) { - OPJ_INT32 tmp = abs(t1->data[i + j * t1->data_stride]); - max = opj_int_max(max, tmp); + datap = t1->data; + for (j = 0; j < t1->h; ++j) { + const OPJ_UINT32 w = t1->w; + for (i = 0; i < w; ++i, ++datap) { + OPJ_INT32 tmp = *datap; + if (tmp < 0) { + OPJ_UINT32 tmp_unsigned; + max = opj_int_max(max, -tmp); + tmp_unsigned = opj_to_smr(tmp); + memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32)); + } else { + max = opj_int_max(max, tmp); + } } } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index bc8a8111..81ad0d00 100644 --- a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -198,7 +198,6 @@ typedef struct opj_t1 { OPJ_UINT32 h; OPJ_UINT32 datasize; OPJ_UINT32 flagssize; - OPJ_UINT32 data_stride; OPJ_BOOL encoder; /* Thre 3 variables below are only used by the decoder */