Sub-tile decoding: speed up vertical pass in IDWT5x3 by processing 4 cols at a time
This commit is contained in:
parent
ccac773556
commit
873004c615
|
@ -1551,6 +1551,7 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
|
|||
OPJ_INT32 cas,
|
||||
opj_sparse_array_int32_t* sa,
|
||||
OPJ_UINT32 sa_col,
|
||||
OPJ_UINT32 nb_cols,
|
||||
OPJ_UINT32 sn,
|
||||
OPJ_UINT32 win_l_y0,
|
||||
OPJ_UINT32 win_l_y1,
|
||||
|
@ -1560,15 +1561,15 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
|
|||
OPJ_BOOL ret;
|
||||
ret = opj_sparse_array_int32_read(sa,
|
||||
sa_col, win_l_y0,
|
||||
sa_col + 1, win_l_y1,
|
||||
dest + cas + 2 * win_l_y0,
|
||||
0, 2, OPJ_TRUE);
|
||||
sa_col + nb_cols, win_l_y1,
|
||||
dest + cas * 4 + 2 * 4 * win_l_y0,
|
||||
1, 2 * 4, OPJ_TRUE);
|
||||
assert(ret);
|
||||
ret = opj_sparse_array_int32_read(sa,
|
||||
sa_col, sn + win_h_y0,
|
||||
sa_col + 1, sn + win_h_y1,
|
||||
dest + 1 - cas + 2 * win_h_y0,
|
||||
0, 2, OPJ_TRUE);
|
||||
sa_col + nb_cols, sn + win_h_y1,
|
||||
dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
|
||||
1, 2 * 4, OPJ_TRUE);
|
||||
assert(ret);
|
||||
OPJ_UNUSED(ret);
|
||||
}
|
||||
|
@ -1648,6 +1649,109 @@ static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Accessors for a work buffer in which 4 columns are processed together.
 * Sample i of lane `off` (0..3) lives at:
 *   - OPJ_S_off(i, off): a[(2 * i) * 4 + off]       (even positions)
 *   - OPJ_D_off(i, off): a[(2 * i + 1) * 4 + off]   (odd positions)
 *
 * These macros are not self-contained: they expand to references to a
 * buffer named `a` and, for the clamped variants, to `sn` and `dn`, all of
 * which must be visible at the point of use.
 *
 * The double-underscore variants clamp the index before accessing:
 *   - OPJ_S__off  clamps i to [0, sn - 1] and reads an even position
 *   - OPJ_D__off  clamps i to [0, dn - 1] and reads an odd position
 *   - OPJ_SS__off clamps i to [0, dn - 1] and reads an even position
 *   - OPJ_DD__off clamps i to [0, sn - 1] and reads an odd position
 *
 * `off` is parenthesized in the index so that a compound argument
 * (e.g. a conditional expression) cannot rebind against the surrounding
 * multiplication/addition. Note that `i` may be evaluated more than once by
 * the clamped variants, so it must be free of side effects.
 */
#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+(off)]
#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+(off)]
#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
|
||||
|
||||
static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
|
||||
OPJ_UINT32 nb_cols,
|
||||
OPJ_INT32 dn, OPJ_INT32 sn,
|
||||
OPJ_INT32 cas,
|
||||
OPJ_INT32 win_l_x0,
|
||||
OPJ_INT32 win_l_x1,
|
||||
OPJ_INT32 win_h_x0,
|
||||
OPJ_INT32 win_h_x1)
|
||||
{
|
||||
OPJ_INT32 i;
|
||||
OPJ_UINT32 off;
|
||||
|
||||
(void)nb_cols;
|
||||
|
||||
if (!cas) {
|
||||
if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
|
||||
|
||||
/* Naive version is :
|
||||
for (i = win_l_x0; i < i_max; i++) {
|
||||
OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
|
||||
}
|
||||
for (i = win_h_x0; i < win_h_x1; i++) {
|
||||
OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
|
||||
}
|
||||
but the compiler doesn't manage to unroll it to avoid bound
|
||||
checking in OPJ_S_ and OPJ_D_ macros
|
||||
*/
|
||||
|
||||
i = win_l_x0;
|
||||
if (i < win_l_x1) {
|
||||
OPJ_INT32 i_max;
|
||||
|
||||
/* Left-most case */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
|
||||
}
|
||||
i ++;
|
||||
|
||||
i_max = win_l_x1;
|
||||
if (i_max > dn) {
|
||||
i_max = dn;
|
||||
}
|
||||
for (; i < i_max; i++) {
|
||||
/* No bound checking */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
|
||||
}
|
||||
}
|
||||
for (; i < win_l_x1; i++) {
|
||||
/* Right-most case */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
i = win_h_x0;
|
||||
if (i < win_h_x1) {
|
||||
OPJ_INT32 i_max = win_h_x1;
|
||||
if (i_max >= sn) {
|
||||
i_max = sn - 1;
|
||||
}
|
||||
for (; i < i_max; i++) {
|
||||
/* No bound checking */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
|
||||
}
|
||||
}
|
||||
for (; i < win_h_x1; i++) {
|
||||
/* Right-most case */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_S_off(0, off) /= 2;
|
||||
}
|
||||
} else {
|
||||
for (i = win_l_x0; i < win_l_x1; i++) {
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
|
||||
}
|
||||
}
|
||||
for (i = win_h_x0; i < win_h_x1; i++) {
|
||||
for (off = 0; off < 4; off++) {
|
||||
OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
|
||||
OPJ_UINT32 resno,
|
||||
OPJ_UINT32 bandno,
|
||||
|
@ -1804,13 +1908,14 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
|
|||
}
|
||||
h_mem_size = opj_dwt_max_resolution(tr, numres);
|
||||
/* overflow check */
|
||||
if (h_mem_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
|
||||
/* in vertical pass, we process 4 columns at a time */
|
||||
if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
|
||||
/* FIXME event manager error callback */
|
||||
opj_sparse_array_int32_free(sa);
|
||||
return OPJ_FALSE;
|
||||
}
|
||||
|
||||
h_mem_size *= sizeof(OPJ_INT32);
|
||||
h_mem_size *= 4 * sizeof(OPJ_INT32);
|
||||
h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
|
||||
if (! h.mem) {
|
||||
/* FIXME event manager error callback */
|
||||
|
@ -1946,31 +2051,35 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
|
|||
}
|
||||
}
|
||||
|
||||
for (i = win_tr_x0; i < win_tr_x1; ++i) {
|
||||
for (i = win_tr_x0; i < win_tr_x1;) {
|
||||
OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
|
||||
opj_dwt_interleave_partial_v(v.mem,
|
||||
v.cas,
|
||||
sa,
|
||||
i,
|
||||
nb_cols,
|
||||
(OPJ_UINT32)v.sn,
|
||||
win_ll_y0,
|
||||
win_ll_y1,
|
||||
win_lh_y0,
|
||||
win_lh_y1);
|
||||
opj_dwt_decode_partial_1(v.mem, v.dn, v.sn, v.cas,
|
||||
(OPJ_INT32)win_ll_y0,
|
||||
(OPJ_INT32)win_ll_y1,
|
||||
(OPJ_INT32)win_lh_y0,
|
||||
(OPJ_INT32)win_lh_y1);
|
||||
opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
|
||||
(OPJ_INT32)win_ll_y0,
|
||||
(OPJ_INT32)win_ll_y1,
|
||||
(OPJ_INT32)win_lh_y0,
|
||||
(OPJ_INT32)win_lh_y1);
|
||||
if (!opj_sparse_array_int32_write(sa,
|
||||
i, win_tr_y0,
|
||||
i + 1, win_tr_y1,
|
||||
v.mem + win_tr_y0,
|
||||
0, 1, OPJ_TRUE)) {
|
||||
i + nb_cols, win_tr_y1,
|
||||
v.mem + 4 * win_tr_y0,
|
||||
1, 4, OPJ_TRUE)) {
|
||||
/* FIXME event manager error callback */
|
||||
opj_sparse_array_int32_free(sa);
|
||||
opj_aligned_free(h.mem);
|
||||
return OPJ_FALSE;
|
||||
}
|
||||
|
||||
i += nb_cols;
|
||||
}
|
||||
}
|
||||
opj_aligned_free(h.mem);
|
||||
|
|
|
@ -165,10 +165,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
|
|||
if (buf_col_stride == 1) {
|
||||
OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
|
||||
(x - x0) * buf_col_stride;
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += buf_line_stride;
|
||||
src_ptr += block_width;
|
||||
if (x_incr == 4) {
|
||||
// Same code as general branch, but the compiler
|
||||
// can have an efficient memcpy()
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += buf_line_stride;
|
||||
src_ptr += block_width;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += buf_line_stride;
|
||||
src_ptr += block_width;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
|
||||
|
@ -179,6 +189,17 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
|
|||
dest_ptr += buf_line_stride;
|
||||
src_ptr += block_width;
|
||||
}
|
||||
} else if (y_incr == 1 && buf_col_stride == 2) {
|
||||
OPJ_UINT32 k;
|
||||
for (k = 0; k < (x_incr & ~3U); k += 4) {
|
||||
dest_ptr[k * buf_col_stride] = src_ptr[k];
|
||||
dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
|
||||
dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
|
||||
dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
|
||||
}
|
||||
for (; k < x_incr; k++) {
|
||||
dest_ptr[k * buf_col_stride] = src_ptr[k];
|
||||
}
|
||||
} else {
|
||||
/* General case */
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
|
@ -207,10 +228,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
|
|||
(size_t)block_width + block_x_offset;
|
||||
const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
|
||||
(size_t)buf_line_stride + (x - x0) * buf_col_stride;
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += block_width;
|
||||
src_ptr += buf_line_stride;
|
||||
if (x_incr == 4) {
|
||||
// Same code as general branch, but the compiler
|
||||
// can have an efficient memcpy()
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += block_width;
|
||||
src_ptr += buf_line_stride;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < y_incr; j++) {
|
||||
memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
|
||||
dest_ptr += block_width;
|
||||
src_ptr += buf_line_stride;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *
|
||||
|
|
Loading…
Reference in New Issue