Add multithreaded support in the DWT encoder.
Update the bench_dwt utility to have a -decode/-encode switch.

Measured performance gains for the DWT encoder on an Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz (4 cores, hyper-threaded).

Encoding time:

$ ./bin/bench_dwt -encode -num_threads 1
time for dwt_encode: total = 8.348 s, wallclock = 8.352 s

$ ./bin/bench_dwt -encode -num_threads 2
time for dwt_encode: total = 9.776 s, wallclock = 4.904 s

$ ./bin/bench_dwt -encode -num_threads 4
time for dwt_encode: total = 13.188 s, wallclock = 3.310 s

$ ./bin/bench_dwt -encode -num_threads 8
time for dwt_encode: total = 30.024 s, wallclock = 4.064 s

Scaling is probably limited by memory access patterns, which make memory access the bottleneck. The slightly worse results with 8 threads than with 4 threads are due to hyperthreading not being appropriate here.
This commit is contained in:
parent
97eb7e0bf1
commit
07d1f775a1
|
@ -256,7 +256,9 @@ if(BUILD_JPIP_SERVER)
|
||||||
endif()
|
endif()
|
||||||
add_subdirectory(src/lib)
|
add_subdirectory(src/lib)
|
||||||
option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF)
|
option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF)
|
||||||
|
if(UNIX)
|
||||||
option(BUILD_UNIT_TESTS "Build unit tests (bench_dwt, test_sparse_array, etc..)" OFF)
|
option(BUILD_UNIT_TESTS "Build unit tests (bench_dwt, test_sparse_array, etc..)" OFF)
|
||||||
|
endif()
|
||||||
|
|
||||||
#-----------------------------------------------------------------------------
|
#-----------------------------------------------------------------------------
|
||||||
# Build Applications
|
# Build Applications
|
||||||
|
|
|
@ -199,7 +199,7 @@ if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
||||||
TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
|
TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||||
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
||||||
|
|
||||||
if(BUILD_UNIT_TESTS)
|
if(BUILD_UNIT_TESTS AND UNIX)
|
||||||
add_executable(bench_dwt bench_dwt.c)
|
add_executable(bench_dwt bench_dwt.c)
|
||||||
if(UNIX)
|
if(UNIX)
|
||||||
target_link_libraries(bench_dwt m ${OPENJPEG_LIBRARY_NAME})
|
target_link_libraries(bench_dwt m ${OPENJPEG_LIBRARY_NAME})
|
||||||
|
@ -215,4 +215,4 @@ if(BUILD_UNIT_TESTS)
|
||||||
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
||||||
target_link_libraries(test_sparse_array ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(test_sparse_array ${CMAKE_THREAD_LIBS_INIT})
|
||||||
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
||||||
endif(BUILD_UNIT_TESTS)
|
endif(BUILD_UNIT_TESTS AND UNIX)
|
||||||
|
|
|
@ -67,6 +67,7 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec,
|
||||||
l_tilec->data[i] = getValue((OPJ_UINT32)i);
|
l_tilec->data[i] = getValue((OPJ_UINT32)i);
|
||||||
}
|
}
|
||||||
l_tilec->numresolutions = numresolutions;
|
l_tilec->numresolutions = numresolutions;
|
||||||
|
l_tilec->minimum_num_resolutions = numresolutions;
|
||||||
l_tilec->resolutions = (opj_tcd_resolution_t*) opj_calloc(
|
l_tilec->resolutions = (opj_tcd_resolution_t*) opj_calloc(
|
||||||
l_tilec->numresolutions,
|
l_tilec->numresolutions,
|
||||||
sizeof(opj_tcd_resolution_t));
|
sizeof(opj_tcd_resolution_t));
|
||||||
|
@ -98,9 +99,9 @@ void free_tilec(opj_tcd_tilecomp_t * l_tilec)
|
||||||
void usage(void)
|
void usage(void)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n");
|
"bench_dwt [-decode|encode] [-size value] [-check] [-display]\n");
|
||||||
printf(
|
printf(
|
||||||
" [-offset x y] [-num_threads val]\n");
|
" [-num_resolutions val] [-offset x y] [-num_threads val]\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -131,6 +132,17 @@ OPJ_FLOAT64 opj_clock(void)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static OPJ_FLOAT64 opj_wallclock(void)
|
||||||
|
{
|
||||||
|
#ifdef _WIN32
|
||||||
|
return opj_clock();
|
||||||
|
#else
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return (OPJ_FLOAT64)tv.tv_sec + 1e-6 * (OPJ_FLOAT64)tv.tv_usec;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char** argv)
|
int main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
int num_threads = 0;
|
int num_threads = 0;
|
||||||
|
@ -146,12 +158,18 @@ int main(int argc, char** argv)
|
||||||
OPJ_BOOL check = OPJ_FALSE;
|
OPJ_BOOL check = OPJ_FALSE;
|
||||||
OPJ_INT32 size = 16384 - 1;
|
OPJ_INT32 size = 16384 - 1;
|
||||||
OPJ_FLOAT64 start, stop;
|
OPJ_FLOAT64 start, stop;
|
||||||
|
OPJ_FLOAT64 start_wc, stop_wc;
|
||||||
OPJ_UINT32 offset_x = ((OPJ_UINT32)size + 1) / 2 - 1;
|
OPJ_UINT32 offset_x = ((OPJ_UINT32)size + 1) / 2 - 1;
|
||||||
OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1;
|
OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1;
|
||||||
OPJ_UINT32 num_resolutions = 6;
|
OPJ_UINT32 num_resolutions = 6;
|
||||||
|
OPJ_BOOL bench_decode = OPJ_TRUE;
|
||||||
|
|
||||||
for (i = 1; i < argc; i++) {
|
for (i = 1; i < argc; i++) {
|
||||||
if (strcmp(argv[i], "-display") == 0) {
|
if (strcmp(argv[i], "-encode") == 0) {
|
||||||
|
bench_decode = OPJ_FALSE;
|
||||||
|
} else if (strcmp(argv[i], "-decode") == 0) {
|
||||||
|
bench_decode = OPJ_TRUE;
|
||||||
|
} else if (strcmp(argv[i], "-display") == 0) {
|
||||||
display = OPJ_TRUE;
|
display = OPJ_TRUE;
|
||||||
check = OPJ_TRUE;
|
check = OPJ_TRUE;
|
||||||
} else if (strcmp(argv[i], "-check") == 0) {
|
} else if (strcmp(argv[i], "-check") == 0) {
|
||||||
|
@ -223,13 +241,26 @@ int main(int argc, char** argv)
|
||||||
image_comp.dy = 1;
|
image_comp.dy = 1;
|
||||||
|
|
||||||
start = opj_clock();
|
start = opj_clock();
|
||||||
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
|
start_wc = opj_wallclock();
|
||||||
|
if (bench_decode) {
|
||||||
|
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
|
||||||
|
} else {
|
||||||
|
opj_dwt_encode(&tcd, &tilec);
|
||||||
|
}
|
||||||
stop = opj_clock();
|
stop = opj_clock();
|
||||||
printf("time for dwt_decode: %.03f s\n", stop - start);
|
stop_wc = opj_wallclock();
|
||||||
|
printf("time for %s: total = %.03f s, wallclock = %.03f s\n",
|
||||||
|
bench_decode ? "dwt_decode" : "dwt_encode",
|
||||||
|
stop - start,
|
||||||
|
stop_wc - start_wc);
|
||||||
|
|
||||||
if (display || check) {
|
if (display || check) {
|
||||||
if (display) {
|
if (display) {
|
||||||
printf("After IDWT\n");
|
if (bench_decode) {
|
||||||
|
printf("After IDWT\n");
|
||||||
|
} else {
|
||||||
|
printf("After FDWT\n");
|
||||||
|
}
|
||||||
k = 0;
|
k = 0;
|
||||||
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
|
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
|
||||||
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
|
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
|
||||||
|
@ -240,9 +271,18 @@ int main(int argc, char** argv)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
opj_dwt_encode(&tilec);
|
if (bench_decode) {
|
||||||
|
opj_dwt_encode(&tcd, &tilec);
|
||||||
|
} else {
|
||||||
|
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
|
||||||
|
}
|
||||||
|
|
||||||
if (display) {
|
if (display) {
|
||||||
printf("After FDWT\n");
|
if (bench_decode) {
|
||||||
|
printf("After FDWT\n");
|
||||||
|
} else {
|
||||||
|
printf("After IDWT\n");
|
||||||
|
}
|
||||||
k = 0;
|
k = 0;
|
||||||
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
|
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
|
||||||
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
|
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
|
||||||
|
|
|
@ -129,7 +129,7 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
||||||
Forward lazy transform (vertical)
|
Forward lazy transform (vertical)
|
||||||
*/
|
*/
|
||||||
static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
||||||
OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
|
OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas);
|
||||||
/**
|
/**
|
||||||
Forward 5-3 wavelet transform in 1-D
|
Forward 5-3 wavelet transform in 1-D
|
||||||
*/
|
*/
|
||||||
|
@ -155,7 +155,8 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
|
||||||
opj_tcd_tilecomp_t* tilec,
|
opj_tcd_tilecomp_t* tilec,
|
||||||
OPJ_UINT32 numres);
|
OPJ_UINT32 numres);
|
||||||
|
|
||||||
static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
|
static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
|
||||||
|
opj_tcd_tilecomp_t * tilec,
|
||||||
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
|
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
|
||||||
|
|
||||||
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
|
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
|
||||||
|
@ -271,7 +272,7 @@ static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
||||||
/* Forward lazy transform (vertical). */
|
/* Forward lazy transform (vertical). */
|
||||||
/* </summary> */
|
/* </summary> */
|
||||||
static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
|
||||||
OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
|
OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas)
|
||||||
{
|
{
|
||||||
OPJ_INT32 i = sn;
|
OPJ_INT32 i = sn;
|
||||||
OPJ_INT32 * l_dest = b;
|
OPJ_INT32 * l_dest = b;
|
||||||
|
@ -1103,28 +1104,92 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
opj_dwt_t h;
|
||||||
|
OPJ_UINT32 rw;
|
||||||
|
OPJ_UINT32 w;
|
||||||
|
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
||||||
|
OPJ_UINT32 min_j;
|
||||||
|
OPJ_UINT32 max_j;
|
||||||
|
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
|
||||||
|
} opj_dwt_encode_h_job_t;
|
||||||
|
|
||||||
|
static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
|
||||||
|
{
|
||||||
|
OPJ_UINT32 j;
|
||||||
|
opj_dwt_encode_h_job_t* job;
|
||||||
|
(void)tls;
|
||||||
|
|
||||||
|
job = (opj_dwt_encode_h_job_t*)user_data;
|
||||||
|
for (j = job->min_j; j < job->max_j; j++) {
|
||||||
|
OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
|
||||||
|
OPJ_UINT32 k;
|
||||||
|
for (k = 0; k < job->rw; k++) {
|
||||||
|
job->h.mem[k] = aj[k];
|
||||||
|
}
|
||||||
|
(*job->p_function)(job->h.mem, job->h.dn, job->h.sn, job->h.cas);
|
||||||
|
opj_dwt_deinterleave_h(job->h.mem, aj, job->h.dn, job->h.sn, job->h.cas);
|
||||||
|
}
|
||||||
|
|
||||||
|
opj_aligned_free(job->h.mem);
|
||||||
|
opj_free(job);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
opj_dwt_t v;
|
||||||
|
OPJ_UINT32 rh;
|
||||||
|
OPJ_UINT32 w;
|
||||||
|
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
||||||
|
OPJ_UINT32 min_j;
|
||||||
|
OPJ_UINT32 max_j;
|
||||||
|
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32);
|
||||||
|
} opj_dwt_encode_v_job_t;
|
||||||
|
|
||||||
|
static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
|
||||||
|
{
|
||||||
|
OPJ_UINT32 j;
|
||||||
|
opj_dwt_encode_v_job_t* job;
|
||||||
|
(void)tls;
|
||||||
|
|
||||||
|
job = (opj_dwt_encode_v_job_t*)user_data;
|
||||||
|
for (j = job->min_j; j < job->max_j; j++) {
|
||||||
|
OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j;
|
||||||
|
OPJ_UINT32 k;
|
||||||
|
for (k = 0; k < job->rh; ++k) {
|
||||||
|
job->v.mem[k] = aj[k * job->w];
|
||||||
|
}
|
||||||
|
|
||||||
|
(*job->p_function)(job->v.mem, job->v.dn, job->v.sn, job->v.cas);
|
||||||
|
|
||||||
|
opj_dwt_deinterleave_v(job->v.mem, aj, job->v.dn, job->v.sn, job->w,
|
||||||
|
job->v.cas);
|
||||||
|
}
|
||||||
|
|
||||||
|
opj_aligned_free(job->v.mem);
|
||||||
|
opj_free(job);
|
||||||
|
}
|
||||||
|
|
||||||
/* <summary> */
|
/* <summary> */
|
||||||
/* Forward 5-3 wavelet transform in 2-D. */
|
/* Forward 5-3 wavelet transform in 2-D. */
|
||||||
/* </summary> */
|
/* </summary> */
|
||||||
static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
|
static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
|
||||||
|
opj_tcd_tilecomp_t * tilec,
|
||||||
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
|
void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
|
||||||
{
|
{
|
||||||
OPJ_INT32 i, j, k;
|
OPJ_INT32 i;
|
||||||
OPJ_INT32 *a = 00;
|
|
||||||
OPJ_INT32 *aj = 00;
|
|
||||||
OPJ_INT32 *bj = 00;
|
OPJ_INT32 *bj = 00;
|
||||||
OPJ_INT32 w, l;
|
OPJ_UINT32 w;
|
||||||
|
OPJ_INT32 l;
|
||||||
|
|
||||||
OPJ_INT32 rw; /* width of the resolution level computed */
|
|
||||||
OPJ_INT32 rh; /* height of the resolution level computed */
|
|
||||||
OPJ_SIZE_T l_data_size;
|
OPJ_SIZE_T l_data_size;
|
||||||
|
|
||||||
opj_tcd_resolution_t * l_cur_res = 0;
|
opj_tcd_resolution_t * l_cur_res = 0;
|
||||||
opj_tcd_resolution_t * l_last_res = 0;
|
opj_tcd_resolution_t * l_last_res = 0;
|
||||||
|
const int num_threads = opj_thread_pool_get_thread_count(tp);
|
||||||
|
OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
|
||||||
|
|
||||||
w = tilec->x1 - tilec->x0;
|
w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
|
||||||
l = (OPJ_INT32)tilec->numresolutions - 1;
|
l = (OPJ_INT32)tilec->numresolutions - 1;
|
||||||
a = tilec->data;
|
|
||||||
|
|
||||||
l_cur_res = tilec->resolutions + l;
|
l_cur_res = tilec->resolutions + l;
|
||||||
l_last_res = l_cur_res - 1;
|
l_last_res = l_cur_res - 1;
|
||||||
|
@ -1136,7 +1201,7 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
|
||||||
return OPJ_FALSE;
|
return OPJ_FALSE;
|
||||||
}
|
}
|
||||||
l_data_size *= sizeof(OPJ_INT32);
|
l_data_size *= sizeof(OPJ_INT32);
|
||||||
bj = (OPJ_INT32*)opj_malloc(l_data_size);
|
bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
|
||||||
/* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
|
/* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
|
||||||
/* in that case, so do not error out */
|
/* in that case, so do not error out */
|
||||||
if (l_data_size != 0 && ! bj) {
|
if (l_data_size != 0 && ! bj) {
|
||||||
|
@ -1145,43 +1210,137 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
|
||||||
i = l;
|
i = l;
|
||||||
|
|
||||||
while (i--) {
|
while (i--) {
|
||||||
OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */
|
OPJ_UINT32 j;
|
||||||
OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */
|
OPJ_UINT32 rw; /* width of the resolution level computed */
|
||||||
|
OPJ_UINT32 rh; /* height of the resolution level computed */
|
||||||
|
OPJ_UINT32
|
||||||
|
rw1; /* width of the resolution level once lower than computed one */
|
||||||
|
OPJ_UINT32
|
||||||
|
rh1; /* height of the resolution level once lower than computed one */
|
||||||
OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
|
OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
|
||||||
OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
|
OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
|
||||||
OPJ_INT32 dn, sn;
|
OPJ_INT32 dn, sn;
|
||||||
|
|
||||||
rw = l_cur_res->x1 - l_cur_res->x0;
|
rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
|
||||||
rh = l_cur_res->y1 - l_cur_res->y0;
|
rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
|
||||||
rw1 = l_last_res->x1 - l_last_res->x0;
|
rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
|
||||||
rh1 = l_last_res->y1 - l_last_res->y0;
|
rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
|
||||||
|
|
||||||
cas_row = l_cur_res->x0 & 1;
|
cas_row = l_cur_res->x0 & 1;
|
||||||
cas_col = l_cur_res->y0 & 1;
|
cas_col = l_cur_res->y0 & 1;
|
||||||
|
|
||||||
sn = rh1;
|
sn = (OPJ_INT32)rh1;
|
||||||
dn = rh - rh1;
|
dn = (OPJ_INT32)(rh - rh1);
|
||||||
for (j = 0; j < rw; ++j) {
|
|
||||||
aj = a + j;
|
/* Perform vertical pass */
|
||||||
for (k = 0; k < rh; ++k) {
|
if (num_threads <= 1 || rw <= 1) {
|
||||||
bj[k] = aj[k * w];
|
for (j = 0; j < rw; ++j) {
|
||||||
|
OPJ_INT32* OPJ_RESTRICT aj = tiledp + j;
|
||||||
|
OPJ_UINT32 k;
|
||||||
|
for (k = 0; k < rh; ++k) {
|
||||||
|
bj[k] = aj[k * w];
|
||||||
|
}
|
||||||
|
|
||||||
|
(*p_function)(bj, dn, sn, cas_col);
|
||||||
|
|
||||||
|
opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
|
||||||
|
OPJ_UINT32 step_j;
|
||||||
|
|
||||||
(*p_function)(bj, dn, sn, cas_col);
|
if (rw < num_jobs) {
|
||||||
|
num_jobs = rw;
|
||||||
|
}
|
||||||
|
step_j = (rw / num_jobs);
|
||||||
|
|
||||||
opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
|
for (j = 0; j < num_jobs; j++) {
|
||||||
|
opj_dwt_encode_v_job_t* job;
|
||||||
|
|
||||||
|
job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
|
||||||
|
if (!job) {
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
|
opj_aligned_free(bj);
|
||||||
|
return OPJ_FALSE;
|
||||||
|
}
|
||||||
|
job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
|
||||||
|
if (!job->v.mem) {
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
|
opj_free(job);
|
||||||
|
opj_aligned_free(bj);
|
||||||
|
return OPJ_FALSE;
|
||||||
|
}
|
||||||
|
job->v.dn = dn;
|
||||||
|
job->v.sn = sn;
|
||||||
|
job->v.cas = cas_col;
|
||||||
|
job->rh = rh;
|
||||||
|
job->w = w;
|
||||||
|
job->tiledp = tiledp;
|
||||||
|
job->min_j = j * step_j;
|
||||||
|
job->max_j = (j + 1U) * step_j; /* this can overflow */
|
||||||
|
if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
|
||||||
|
job->max_j = rw;
|
||||||
|
}
|
||||||
|
job->p_function = p_function;
|
||||||
|
opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
|
||||||
|
}
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
sn = rw1;
|
sn = (OPJ_INT32)rw1;
|
||||||
dn = rw - rw1;
|
dn = (OPJ_INT32)(rw - rw1);
|
||||||
|
|
||||||
for (j = 0; j < rh; j++) {
|
/* Perform horizontal pass */
|
||||||
aj = a + j * w;
|
if (num_threads <= 1 || rh <= 1) {
|
||||||
for (k = 0; k < rw; k++) {
|
for (j = 0; j < rh; j++) {
|
||||||
bj[k] = aj[k];
|
OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
|
||||||
|
OPJ_UINT32 k;
|
||||||
|
for (k = 0; k < rw; k++) {
|
||||||
|
bj[k] = aj[k];
|
||||||
|
}
|
||||||
|
(*p_function)(bj, dn, sn, cas_row);
|
||||||
|
opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
|
||||||
}
|
}
|
||||||
(*p_function)(bj, dn, sn, cas_row);
|
} else {
|
||||||
opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
|
OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
|
||||||
|
OPJ_UINT32 step_j;
|
||||||
|
|
||||||
|
if (rh < num_jobs) {
|
||||||
|
num_jobs = rh;
|
||||||
|
}
|
||||||
|
step_j = (rh / num_jobs);
|
||||||
|
|
||||||
|
for (j = 0; j < num_jobs; j++) {
|
||||||
|
opj_dwt_encode_h_job_t* job;
|
||||||
|
|
||||||
|
job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
|
||||||
|
if (!job) {
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
|
opj_aligned_free(bj);
|
||||||
|
return OPJ_FALSE;
|
||||||
|
}
|
||||||
|
job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
|
||||||
|
if (!job->h.mem) {
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
|
opj_free(job);
|
||||||
|
opj_aligned_free(bj);
|
||||||
|
return OPJ_FALSE;
|
||||||
|
}
|
||||||
|
job->h.dn = dn;
|
||||||
|
job->h.sn = sn;
|
||||||
|
job->h.cas = cas_row;
|
||||||
|
job->rw = rw;
|
||||||
|
job->w = w;
|
||||||
|
job->tiledp = tiledp;
|
||||||
|
job->min_j = j * step_j;
|
||||||
|
job->max_j = (j + 1U) * step_j; /* this can overflow */
|
||||||
|
if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
|
||||||
|
job->max_j = rh;
|
||||||
|
}
|
||||||
|
job->p_function = p_function;
|
||||||
|
opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
|
||||||
|
}
|
||||||
|
opj_thread_pool_wait_completion(tp, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
l_cur_res = l_last_res;
|
l_cur_res = l_last_res;
|
||||||
|
@ -1189,15 +1348,16 @@ static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
|
||||||
--l_last_res;
|
--l_last_res;
|
||||||
}
|
}
|
||||||
|
|
||||||
opj_free(bj);
|
opj_aligned_free(bj);
|
||||||
return OPJ_TRUE;
|
return OPJ_TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Forward 5-3 wavelet transform in 2-D. */
|
/* Forward 5-3 wavelet transform in 2-D. */
|
||||||
/* </summary> */
|
/* </summary> */
|
||||||
OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
|
OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
|
||||||
|
opj_tcd_tilecomp_t * tilec)
|
||||||
{
|
{
|
||||||
return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
|
return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, opj_dwt_encode_1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* <summary> */
|
/* <summary> */
|
||||||
|
@ -1247,9 +1407,11 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
|
||||||
/* <summary> */
|
/* <summary> */
|
||||||
/* Forward 9-7 wavelet transform in 2-D. */
|
/* Forward 9-7 wavelet transform in 2-D. */
|
||||||
/* </summary> */
|
/* </summary> */
|
||||||
OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
|
OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
|
||||||
|
opj_tcd_tilecomp_t * tilec)
|
||||||
{
|
{
|
||||||
return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
|
return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
|
||||||
|
opj_dwt_encode_1_real);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* <summary> */
|
/* <summary> */
|
||||||
|
@ -1328,15 +1490,15 @@ typedef struct {
|
||||||
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
||||||
OPJ_UINT32 min_j;
|
OPJ_UINT32 min_j;
|
||||||
OPJ_UINT32 max_j;
|
OPJ_UINT32 max_j;
|
||||||
} opj_dwd_decode_h_job_t;
|
} opj_dwt_decode_h_job_t;
|
||||||
|
|
||||||
static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
|
static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
|
||||||
{
|
{
|
||||||
OPJ_UINT32 j;
|
OPJ_UINT32 j;
|
||||||
opj_dwd_decode_h_job_t* job;
|
opj_dwt_decode_h_job_t* job;
|
||||||
(void)tls;
|
(void)tls;
|
||||||
|
|
||||||
job = (opj_dwd_decode_h_job_t*)user_data;
|
job = (opj_dwt_decode_h_job_t*)user_data;
|
||||||
for (j = job->min_j; j < job->max_j; j++) {
|
for (j = job->min_j; j < job->max_j; j++) {
|
||||||
opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
|
opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
|
||||||
}
|
}
|
||||||
|
@ -1352,15 +1514,15 @@ typedef struct {
|
||||||
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
OPJ_INT32 * OPJ_RESTRICT tiledp;
|
||||||
OPJ_UINT32 min_j;
|
OPJ_UINT32 min_j;
|
||||||
OPJ_UINT32 max_j;
|
OPJ_UINT32 max_j;
|
||||||
} opj_dwd_decode_v_job_t;
|
} opj_dwt_decode_v_job_t;
|
||||||
|
|
||||||
static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
|
static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
|
||||||
{
|
{
|
||||||
OPJ_UINT32 j;
|
OPJ_UINT32 j;
|
||||||
opj_dwd_decode_v_job_t* job;
|
opj_dwt_decode_v_job_t* job;
|
||||||
(void)tls;
|
(void)tls;
|
||||||
|
|
||||||
job = (opj_dwd_decode_v_job_t*)user_data;
|
job = (opj_dwt_decode_v_job_t*)user_data;
|
||||||
for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
|
for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
|
||||||
j += PARALLEL_COLS_53) {
|
j += PARALLEL_COLS_53) {
|
||||||
opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
|
opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
|
||||||
|
@ -1447,9 +1609,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
|
||||||
step_j = (rh / num_jobs);
|
step_j = (rh / num_jobs);
|
||||||
|
|
||||||
for (j = 0; j < num_jobs; j++) {
|
for (j = 0; j < num_jobs; j++) {
|
||||||
opj_dwd_decode_h_job_t* job;
|
opj_dwt_decode_h_job_t* job;
|
||||||
|
|
||||||
job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
|
job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
|
||||||
if (!job) {
|
if (!job) {
|
||||||
/* It would be nice to fallback to single thread case, but */
|
/* It would be nice to fallback to single thread case, but */
|
||||||
/* unfortunately some jobs may be launched and have modified */
|
/* unfortunately some jobs may be launched and have modified */
|
||||||
|
@ -1502,9 +1664,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
|
||||||
step_j = (rw / num_jobs);
|
step_j = (rw / num_jobs);
|
||||||
|
|
||||||
for (j = 0; j < num_jobs; j++) {
|
for (j = 0; j < num_jobs; j++) {
|
||||||
opj_dwd_decode_v_job_t* job;
|
opj_dwt_decode_v_job_t* job;
|
||||||
|
|
||||||
job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
|
job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
|
||||||
if (!job) {
|
if (!job) {
|
||||||
/* It would be nice to fallback to single thread case, but */
|
/* It would be nice to fallback to single thread case, but */
|
||||||
/* unfortunately some jobs may be launched and have modified */
|
/* unfortunately some jobs may be launched and have modified */
|
||||||
|
|
|
@ -56,9 +56,11 @@ DWT.C are used by some function in TCD.C.
|
||||||
/**
|
/**
|
||||||
Forward 5-3 wavelet transform in 2-D.
|
Forward 5-3 wavelet transform in 2-D.
|
||||||
Apply a reversible DWT transform to a component of an image.
|
Apply a reversible DWT transform to a component of an image.
|
||||||
|
@param p_tcd TCD handle
|
||||||
@param tilec Tile component information (current tile)
|
@param tilec Tile component information (current tile)
|
||||||
*/
|
*/
|
||||||
OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec);
|
OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
|
||||||
|
opj_tcd_tilecomp_t * tilec);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Inverse 5-3 wavelet transform in 2-D.
|
Inverse 5-3 wavelet transform in 2-D.
|
||||||
|
@ -87,9 +89,11 @@ OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient);
|
||||||
/**
|
/**
|
||||||
Forward 9-7 wavelet transform in 2-D.
|
Forward 9-7 wavelet transform in 2-D.
|
||||||
Apply an irreversible DWT transform to a component of an image.
|
Apply an irreversible DWT transform to a component of an image.
|
||||||
|
@param p_tcd TCD handle
|
||||||
@param tilec Tile component information (current tile)
|
@param tilec Tile component information (current tile)
|
||||||
*/
|
*/
|
||||||
OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec);
|
OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
|
||||||
|
opj_tcd_tilecomp_t * tilec);
|
||||||
/**
|
/**
|
||||||
Inverse 9-7 wavelet transform in 2-D.
|
Inverse 9-7 wavelet transform in 2-D.
|
||||||
Apply an irreversible inverse DWT transform to a component of an image.
|
Apply an irreversible inverse DWT transform to a component of an image.
|
||||||
|
|
|
@ -2488,11 +2488,11 @@ static OPJ_BOOL opj_tcd_dwt_encode(opj_tcd_t *p_tcd)
|
||||||
|
|
||||||
for (compno = 0; compno < l_tile->numcomps; ++compno) {
|
for (compno = 0; compno < l_tile->numcomps; ++compno) {
|
||||||
if (l_tccp->qmfbid == 1) {
|
if (l_tccp->qmfbid == 1) {
|
||||||
if (! opj_dwt_encode(l_tile_comp)) {
|
if (! opj_dwt_encode(p_tcd, l_tile_comp)) {
|
||||||
return OPJ_FALSE;
|
return OPJ_FALSE;
|
||||||
}
|
}
|
||||||
} else if (l_tccp->qmfbid == 0) {
|
} else if (l_tccp->qmfbid == 0) {
|
||||||
if (! opj_dwt_encode_real(l_tile_comp)) {
|
if (! opj_dwt_encode_real(p_tcd, l_tile_comp)) {
|
||||||
return OPJ_FALSE;
|
return OPJ_FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -121,6 +121,9 @@ BUILD_TESTING:BOOL=${BUILD_TESTING}
|
||||||
# Build Thirdparty, useful but not required for test suite
|
# Build Thirdparty, useful but not required for test suite
|
||||||
BUILD_THIRDPARTY:BOOL=TRUE
|
BUILD_THIRDPARTY:BOOL=TRUE
|
||||||
|
|
||||||
|
# Build unit tests that test subcomponents of libopenjp2 (e.g. DWT)
|
||||||
|
BUILD_UNIT_TESTS:BOOL=TRUE
|
||||||
|
|
||||||
# JPEG2000 test files are available with git clone https://github.com/uclouvain/openjpeg-data.git
|
# JPEG2000 test files are available with git clone https://github.com/uclouvain/openjpeg-data.git
|
||||||
OPJ_DATA_ROOT:PATH=$ENV{PWD}/data
|
OPJ_DATA_ROOT:PATH=$ENV{PWD}/data
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue