Sub-tile decoding: speed up vertical pass in IDWT5x3 by processing 4 cols at a time

This commit is contained in:
Even Rouault 2017-09-01 16:31:00 +02:00
parent ccac773556
commit 873004c615
2 changed files with 165 additions and 25 deletions

View File

@ -1551,6 +1551,7 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
OPJ_INT32 cas,
opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_col,
OPJ_UINT32 nb_cols,
OPJ_UINT32 sn,
OPJ_UINT32 win_l_y0,
OPJ_UINT32 win_l_y1,
@ -1560,15 +1561,15 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
OPJ_BOOL ret;
ret = opj_sparse_array_int32_read(sa,
sa_col, win_l_y0,
sa_col + 1, win_l_y1,
dest + cas + 2 * win_l_y0,
0, 2, OPJ_TRUE);
sa_col + nb_cols, win_l_y1,
dest + cas * 4 + 2 * 4 * win_l_y0,
1, 2 * 4, OPJ_TRUE);
assert(ret);
ret = opj_sparse_array_int32_read(sa,
sa_col, sn + win_h_y0,
sa_col + 1, sn + win_h_y1,
dest + 1 - cas + 2 * win_h_y0,
0, 2, OPJ_TRUE);
sa_col + nb_cols, sn + win_h_y1,
dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
1, 2 * 4, OPJ_TRUE);
assert(ret);
OPJ_UNUSED(ret);
}
@ -1648,6 +1649,109 @@ static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
}
}
/*
 * Accessors for a buffer `a` that holds 4 columns interleaved: each sample
 * position occupies 4 consecutive OPJ_INT32 slots, one per column, and `off`
 * (0..3) selects the column.
 *
 * All of these macros expect a pointer named `a` in scope at the expansion
 * site; the clamping variants additionally expect `sn` and `dn`. Their
 * arguments are evaluated more than once, so pass side-effect-free
 * expressions only.
 */
/* Even sample slot i: a[i * 8 + off]. */
#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
/* Odd sample slot i: a[(2 * i + 1) * 4 + off]. */
#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
/* OPJ_S_off with the index clamped to [0, sn - 1]. */
#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
/* OPJ_D_off with the index clamped to [0, dn - 1]. */
#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
/* OPJ_S_off with the index clamped to [0, dn - 1] (used when cas != 0). */
#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
/* OPJ_D_off with the index clamped to [0, sn - 1] (used when cas != 0). */
#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
/* Convert a 32-bit unsigned value to OPJ_INT32 with two's complement
 * wrap-around, without relying on implementation-defined conversion. */
static OPJ_INT32 opj_dwt_uint32_to_int32_wrap(OPJ_UINT32 value)
{
    if (value <= 0x7FFFFFFFU) {
        return (OPJ_INT32)value;
    }
    return (OPJ_INT32)(value - 0x80000000U) - 0x7FFFFFFF - 1;
}

/* lhs + rhs with wrap-around instead of undefined behaviour on overflow. */
static OPJ_INT32 opj_dwt_add_wrap(OPJ_INT32 lhs, OPJ_INT32 rhs)
{
    return opj_dwt_uint32_to_int32_wrap((OPJ_UINT32)lhs + (OPJ_UINT32)rhs);
}

/* lhs - rhs with wrap-around instead of undefined behaviour on overflow. */
static OPJ_INT32 opj_dwt_sub_wrap(OPJ_INT32 lhs, OPJ_INT32 rhs)
{
    return opj_dwt_uint32_to_int32_wrap((OPJ_UINT32)lhs - (OPJ_UINT32)rhs);
}

/**
 * Inverse 5x3 lifting restricted to a window, applied to 4 interleaved
 * columns at once.
 *
 * The buffer is addressed through the OPJ_S_off / OPJ_D_off macro family:
 * each sample position holds 4 consecutive values, one per column.
 *
 * Coefficients come from the codestream, so every addition/subtraction is
 * done with wrap-around helpers: a crafted input must not be able to trigger
 * signed integer overflow (undefined behaviour). Results are identical to
 * plain arithmetic whenever no overflow occurs.
 *
 * @param a         interleaved working buffer, modified in place
 * @param nb_cols   number of meaningful columns; currently unused, all 4
 *                  column slots are always processed
 * @param dn        clamp bound used by the OPJ_D__off / OPJ_SS__off accessors
 * @param sn        clamp bound used by the OPJ_S__off / OPJ_DD__off accessors
 * @param cas       0 or non-zero; selects which of the two lifting layouts
 *                  is applied
 * @param win_l_x0  first index (inclusive) of the first lifting step
 * @param win_l_x1  last index (exclusive) of the first lifting step
 * @param win_h_x0  first index (inclusive) of the second lifting step
 * @param win_h_x1  last index (exclusive) of the second lifting step
 */
static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
        OPJ_UINT32 nb_cols,
        OPJ_INT32 dn, OPJ_INT32 sn,
        OPJ_INT32 cas,
        OPJ_INT32 win_l_x0,
        OPJ_INT32 win_l_x1,
        OPJ_INT32 win_h_x0,
        OPJ_INT32 win_h_x1)
{
    OPJ_INT32 i;
    OPJ_UINT32 off;

    (void)nb_cols;

    if (!cas) {
        if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */

            /* Naive version is :
            for (i = win_l_x0; i < win_l_x1; i++) {
                OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
            }
            for (i = win_h_x0; i < win_h_x1; i++) {
                OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
            }
            but the compiler doesn't manage to unroll it to avoid bound
            checking in OPJ_S_ and OPJ_D_ macros
            */

            i = win_l_x0;
            if (i < win_l_x1) {
                OPJ_INT32 i_max;

                /* Left-most case: i - 1 may be negative, use clamped access */
                for (off = 0; off < 4; off++) {
                    OPJ_S_off(i, off) = opj_dwt_sub_wrap(
                                            OPJ_S_off(i, off),
                                            opj_dwt_add_wrap(
                                                opj_dwt_add_wrap(OPJ_D__off(i - 1, off),
                                                        OPJ_D__off(i, off)),
                                                2) >> 2);
                }
                i ++;

                i_max = win_l_x1;
                if (i_max > dn) {
                    i_max = dn;
                }
                for (; i < i_max; i++) {
                    /* No bound checking: here 1 <= i < dn */
                    for (off = 0; off < 4; off++) {
                        OPJ_S_off(i, off) = opj_dwt_sub_wrap(
                                                OPJ_S_off(i, off),
                                                opj_dwt_add_wrap(
                                                    opj_dwt_add_wrap(OPJ_D_off(i - 1, off),
                                                            OPJ_D_off(i, off)),
                                                    2) >> 2);
                    }
                }
                for (; i < win_l_x1; i++) {
                    /* Right-most case: i may reach dn, use clamped access */
                    for (off = 0; off < 4; off++) {
                        OPJ_S_off(i, off) = opj_dwt_sub_wrap(
                                                OPJ_S_off(i, off),
                                                opj_dwt_add_wrap(
                                                    opj_dwt_add_wrap(OPJ_D__off(i - 1, off),
                                                            OPJ_D__off(i, off)),
                                                    2) >> 2);
                    }
                }
            }

            i = win_h_x0;
            if (i < win_h_x1) {
                OPJ_INT32 i_max = win_h_x1;
                if (i_max >= sn) {
                    i_max = sn - 1;
                }
                for (; i < i_max; i++) {
                    /* No bound checking: here i + 1 < sn */
                    for (off = 0; off < 4; off++) {
                        OPJ_D_off(i, off) = opj_dwt_add_wrap(
                                                OPJ_D_off(i, off),
                                                opj_dwt_add_wrap(OPJ_S_off(i, off),
                                                        OPJ_S_off(i + 1, off)) >> 1);
                    }
                }
                for (; i < win_h_x1; i++) {
                    /* Right-most case: i + 1 may reach sn, use clamped access */
                    for (off = 0; off < 4; off++) {
                        OPJ_D_off(i, off) = opj_dwt_add_wrap(
                                                OPJ_D_off(i, off),
                                                opj_dwt_add_wrap(OPJ_S__off(i, off),
                                                        OPJ_S__off(i + 1, off)) >> 1);
                    }
                }
            }
        }
    } else {
        if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
            for (off = 0; off < 4; off++) {
                OPJ_S_off(0, off) /= 2;
            }
        } else {
            for (i = win_l_x0; i < win_l_x1; i++) {
                for (off = 0; off < 4; off++) {
                    OPJ_D_off(i, off) = opj_dwt_sub_wrap(
                                            OPJ_D_off(i, off),
                                            opj_dwt_add_wrap(
                                                opj_dwt_add_wrap(OPJ_SS__off(i, off),
                                                        OPJ_SS__off(i + 1, off)),
                                                2) >> 2);
                }
            }
            for (i = win_h_x0; i < win_h_x1; i++) {
                for (off = 0; off < 4; off++) {
                    OPJ_S_off(i, off) = opj_dwt_add_wrap(
                                            OPJ_S_off(i, off),
                                            opj_dwt_add_wrap(OPJ_DD__off(i, off),
                                                    OPJ_DD__off(i - 1, off)) >> 1);
                }
            }
        }
    }
}
static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 resno,
OPJ_UINT32 bandno,
@ -1804,13 +1908,14 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
}
h_mem_size = opj_dwt_max_resolution(tr, numres);
/* overflow check */
if (h_mem_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
/* in vertical pass, we process 4 columns at a time */
if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
/* FIXME event manager error callback */
opj_sparse_array_int32_free(sa);
return OPJ_FALSE;
}
h_mem_size *= sizeof(OPJ_INT32);
h_mem_size *= 4 * sizeof(OPJ_INT32);
h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
if (! h.mem) {
/* FIXME event manager error callback */
@ -1946,31 +2051,35 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
}
}
for (i = win_tr_x0; i < win_tr_x1; ++i) {
for (i = win_tr_x0; i < win_tr_x1;) {
OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
opj_dwt_interleave_partial_v(v.mem,
v.cas,
sa,
i,
nb_cols,
(OPJ_UINT32)v.sn,
win_ll_y0,
win_ll_y1,
win_lh_y0,
win_lh_y1);
opj_dwt_decode_partial_1(v.mem, v.dn, v.sn, v.cas,
(OPJ_INT32)win_ll_y0,
(OPJ_INT32)win_ll_y1,
(OPJ_INT32)win_lh_y0,
(OPJ_INT32)win_lh_y1);
opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
(OPJ_INT32)win_ll_y0,
(OPJ_INT32)win_ll_y1,
(OPJ_INT32)win_lh_y0,
(OPJ_INT32)win_lh_y1);
if (!opj_sparse_array_int32_write(sa,
i, win_tr_y0,
i + 1, win_tr_y1,
v.mem + win_tr_y0,
0, 1, OPJ_TRUE)) {
i + nb_cols, win_tr_y1,
v.mem + 4 * win_tr_y0,
1, 4, OPJ_TRUE)) {
/* FIXME event manager error callback */
opj_sparse_array_int32_free(sa);
opj_aligned_free(h.mem);
return OPJ_FALSE;
}
i += nb_cols;
}
}
opj_aligned_free(h.mem);

View File

@ -165,10 +165,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
if (buf_col_stride == 1) {
OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
(x - x0) * buf_col_stride;
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += buf_line_stride;
src_ptr += block_width;
if (x_incr == 4) {
// Same code as general branch, but the compiler
// can have an efficient memcpy()
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += buf_line_stride;
src_ptr += block_width;
}
} else {
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += buf_line_stride;
src_ptr += block_width;
}
}
} else {
OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
@ -179,6 +189,17 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
dest_ptr += buf_line_stride;
src_ptr += block_width;
}
} else if (y_incr == 1 && buf_col_stride == 2) {
OPJ_UINT32 k;
for (k = 0; k < (x_incr & ~3U); k += 4) {
dest_ptr[k * buf_col_stride] = src_ptr[k];
dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
}
for (; k < x_incr; k++) {
dest_ptr[k * buf_col_stride] = src_ptr[k];
}
} else {
/* General case */
for (j = 0; j < y_incr; j++) {
@ -207,10 +228,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
(size_t)block_width + block_x_offset;
const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
(size_t)buf_line_stride + (x - x0) * buf_col_stride;
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += block_width;
src_ptr += buf_line_stride;
if (x_incr == 4) {
// Same code as general branch, but the compiler
// can have an efficient memcpy()
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += block_width;
src_ptr += buf_line_stride;
}
} else {
for (j = 0; j < y_incr; j++) {
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
dest_ptr += block_width;
src_ptr += buf_line_stride;
}
}
} else {
OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *