From ca34d13e76a588a00171e57690c1deeaf068723a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 6 Jul 2017 16:11:11 +0200 Subject: [PATCH] Decoding: do not allocate memory for the codestream of each codeblock Currently we allocate at least 8192 bytes for each codeblock, and copy the relevant parts of the codestream in that per-codeblock buffer as we decode packets. As the whole codestream for the tile is ingested in memory and alive during the decoding, we can directly point to it instead of copying. But to do that, we need an intermediate concept, a 'chunk' of code-stream segment, given that segments may be made of data at different places in the code-stream when quality layers are used. With that change, the decoding of MAPA_005.jp2 goes down from the previous improvement of 2.7 GB down to 1.9 GB. New profile: n4: 1885648469 (heap allocation functions) malloc/new/new[], --alloc-fns, etc. n1: 1610689344 0x4E78287: opj_aligned_malloc (opj_malloc.c:61) n1: 1610689344 0x4E71D7B: opj_alloc_tile_component_data (tcd.c:676) n1: 1610689344 0x4E7272C: opj_tcd_init_decode_tile (tcd.c:816) n1: 1610689344 0x4E4BDD9: opj_j2k_read_tile_header (j2k.c:8618) n1: 1610689344 0x4E4C8A2: opj_j2k_decode_tiles (j2k.c:10349) n1: 1610689344 0x4E4E36E: opj_j2k_decode (j2k.c:7847) n1: 1610689344 0x4E52FA2: opj_jp2_decode (jp2.c:1564) n0: 1610689344 0x40374E: main (opj_decompress.c:1459) n1: 219232541 0x4E4BBF0: opj_j2k_read_tile_header (j2k.c:4685) n1: 219232541 0x4E4C8A2: opj_j2k_decode_tiles (j2k.c:10349) n1: 219232541 0x4E4E36E: opj_j2k_decode (j2k.c:7847) n1: 219232541 0x4E52FA2: opj_jp2_decode (jp2.c:1564) n0: 219232541 0x40374E: main (opj_decompress.c:1459) n1: 39822000 0x4E727A9: opj_tcd_init_decode_tile (tcd.c:1219) n1: 39822000 0x4E4BDD9: opj_j2k_read_tile_header (j2k.c:8618) n1: 39822000 0x4E4C8A2: opj_j2k_decode_tiles (j2k.c:10349) n1: 39822000 0x4E4E36E: opj_j2k_decode (j2k.c:7847) n1: 39822000 0x4E52FA2: opj_jp2_decode (jp2.c:1564) n0: 39822000 0x40374E: main 
(opj_decompress.c:1459) n0: 15904584 in 52 places, all below massif's threshold (1.00%) --- src/lib/openjp2/j2k.c | 26 +++++++++++++-- src/lib/openjp2/opj_common.h | 1 - src/lib/openjp2/t1.c | 50 ++++++++++++++++++++++++++--- src/lib/openjp2/t1.h | 6 ++++ src/lib/openjp2/t2.c | 59 +++++++++++++--------------------- src/lib/openjp2/tcd.c | 36 +++++++++++---------- src/lib/openjp2/tcd.h | 61 +++++++++++++++++++++++------------- 7 files changed, 154 insertions(+), 85 deletions(-) diff --git a/src/lib/openjp2/j2k.c b/src/lib/openjp2/j2k.c index 9de69cec..3737655b 100644 --- a/src/lib/openjp2/j2k.c +++ b/src/lib/openjp2/j2k.c @@ -4684,15 +4684,35 @@ static OPJ_BOOL opj_j2k_read_sod(opj_j2k_t *p_j2k, "Tile part length size inconsistent with stream length\n"); return OPJ_FALSE; } + if (p_j2k->m_specific_param.m_decoder.m_sot_length > + UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA) { + opj_event_msg(p_manager, EVT_ERROR, + "p_j2k->m_specific_param.m_decoder.m_sot_length > " + "UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA"); + return OPJ_FALSE; + } + /* Add a margin of OPJ_COMMON_CBLK_DATA_EXTRA to the allocation we */ + /* do so that opj_mqc_init_dec_common() can safely add a synthetic */ + /* 0xFFFF marker. */ if (! *l_current_data) { /* LH: oddly enough, in this path, l_tile_len!=0. * TODO: If this was consistent, we could simplify the code to only use realloc(), as realloc(0,...) default to malloc(0,...). 
*/ *l_current_data = (OPJ_BYTE*) opj_malloc( - p_j2k->m_specific_param.m_decoder.m_sot_length); + p_j2k->m_specific_param.m_decoder.m_sot_length + OPJ_COMMON_CBLK_DATA_EXTRA); } else { - OPJ_BYTE *l_new_current_data = (OPJ_BYTE *) opj_realloc(*l_current_data, - *l_tile_len + p_j2k->m_specific_param.m_decoder.m_sot_length); + OPJ_BYTE *l_new_current_data; + if (*l_tile_len > UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA - + p_j2k->m_specific_param.m_decoder.m_sot_length) { + opj_event_msg(p_manager, EVT_ERROR, + "*l_tile_len > UINT_MAX - OPJ_COMMON_CBLK_DATA_EXTRA - " + "p_j2k->m_specific_param.m_decoder.m_sot_length"); + return OPJ_FALSE; + } + + l_new_current_data = (OPJ_BYTE *) opj_realloc(*l_current_data, + *l_tile_len + p_j2k->m_specific_param.m_decoder.m_sot_length + + OPJ_COMMON_CBLK_DATA_EXTRA); if (! l_new_current_data) { opj_free(*l_current_data); /*nothing more is done as l_current_data will be set to null, and just diff --git a/src/lib/openjp2/opj_common.h b/src/lib/openjp2/opj_common.h index 8db83fc5..a0513391 100644 --- a/src/lib/openjp2/opj_common.h +++ b/src/lib/openjp2/opj_common.h @@ -36,7 +36,6 @@ Common constants shared among several modules ========================================================== */ -#define OPJ_COMMON_DEFAULT_CBLK_DATA_SIZE 8192 #define OPJ_COMMON_CBLK_DATA_EXTRA 2 /**< Margin for a fake FFFF marker */ #endif /* OPJ_COMMMON_H */ diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 800b6ed4..f932599b 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1604,6 +1604,8 @@ void opj_t1_destroy(opj_t1_t *p_t1) p_t1->flags = 00; } + opj_free(p_t1->segdatabuffer); + opj_free(p_t1); } @@ -1613,6 +1615,7 @@ typedef struct { opj_tcd_band_t* band; opj_tcd_tilecomp_t* tilec; opj_tccp_t* tccp; + OPJ_BOOL mustuse_segdatabuffer; volatile OPJ_BOOL* pret; opj_event_mgr_t *p_manager; opj_mutex_t* p_manager_mutex; @@ -1657,6 +1660,7 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) t1 = 
opj_t1_create(OPJ_FALSE); opj_tls_set(tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper); } + t1->mustuse_segdatabuffer = job->mustuse_segdatabuffer; if (OPJ_FALSE == opj_t1_decode_cblk( t1, @@ -1786,6 +1790,7 @@ void opj_t1_decode_cblks(opj_thread_pool_t* tp, job->p_manager_mutex = p_manager_mutex; job->p_manager = p_manager; job->check_pterm = check_pterm; + job->mustuse_segdatabuffer = opj_thread_pool_get_thread_count(tp) > 1; opj_thread_pool_submit_job(tp, opj_t1_clbl_decode_processor, job); if (!(*pret)) { return; @@ -1846,19 +1851,54 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, for (segno = 0; segno < cblk->real_num_segs; ++segno) { opj_tcd_seg_t *seg = &cblk->segs[segno]; + OPJ_BYTE* segdata; + OPJ_UINT32 seglen; /* BYPASS mode */ type = ((bpno_plus_one <= ((OPJ_INT32)(cblk->numbps)) - 4) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ; - /* FIXME: slviewer gets here with a null pointer. Why? Partially downloaded and/or corrupt textures? */ - if (seg->data == 00) { - continue; + + /* Even if we have a single chunk, in multithreaded decoding */ + /* the insertion of our synthetic marker might potentially overwrite */ + /* valid codestream of other codeblocks decoded in parallel. 
*/ + if (seg->numchunks == 1 && !(t1->mustuse_segdatabuffer)) { + segdata = seg->chunks[0].data; + seglen = seg->chunks[0].len; + } else { + OPJ_UINT32 i; + + /* Compute whole segment length from chunk lengths */ + seglen = 0; + for (i = 0; i < seg->numchunks; i++) { + seglen += seg->chunks[i].len; + } + + /* Allocate temporary memory if needed */ + if (seglen + OPJ_COMMON_CBLK_DATA_EXTRA > t1->segdatabuffersize) { + segdata = (OPJ_BYTE*)opj_realloc(t1->segdatabuffer, + seglen + OPJ_COMMON_CBLK_DATA_EXTRA); + if (segdata == NULL) { + return OPJ_FALSE; + } + t1->segdatabuffer = segdata; + memset(t1->segdatabuffer + seglen, 0, OPJ_COMMON_CBLK_DATA_EXTRA); + t1->segdatabuffersize = seglen + OPJ_COMMON_CBLK_DATA_EXTRA; + } + + /* Concatenate all segment chunks */ + segdata = t1->segdatabuffer; + seglen = 0; + for (i = 0; i < seg->numchunks; i++) { + memcpy(segdata + seglen, seg->chunks[i].data, seg->chunks[i].len); + seglen += seg->chunks[i].len; + } } + if (type == T1_TYPE_RAW) { - opj_mqc_raw_init_dec(mqc, (*seg->data) + seg->dataindex, seg->len, + opj_mqc_raw_init_dec(mqc, segdata, seglen, OPJ_COMMON_CBLK_DATA_EXTRA); } else { - opj_mqc_init_dec(mqc, (*seg->data) + seg->dataindex, seg->len, + opj_mqc_init_dec(mqc, segdata, seglen, OPJ_COMMON_CBLK_DATA_EXTRA); } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index 6802d188..da8b0c80 100644 --- a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -200,6 +200,12 @@ typedef struct opj_t1 { OPJ_UINT32 flagssize; OPJ_UINT32 data_stride; OPJ_BOOL encoder; + + /* The 3 variables below are only used by the decoder */ + OPJ_BOOL mustuse_segdatabuffer; /* set to TRUE in multithreaded context */ + OPJ_BYTE + *segdatabuffer; /* Temporary buffer to concatenate all chunks of a segment */ + OPJ_UINT32 segdatabuffersize; /* Maximum size available in segdatabuffer */ } opj_t1_t; /** @name Exported functions */ diff --git a/src/lib/openjp2/t2.c b/src/lib/openjp2/t2.c index 760e17ac..5d5e33ea 100644 --- 
a/src/lib/openjp2/t2.c +++ b/src/lib/openjp2/t2.c @@ -1245,7 +1245,6 @@ static OPJ_BOOL opj_t2_read_packet_data(opj_t2_t* p_t2, if (!l_cblk->numsegs) { l_seg = l_cblk->segs; ++l_cblk->numsegs; - l_cblk->data_current_size = 0; } else { l_seg = &l_cblk->segs[l_cblk->numsegs - 1]; @@ -1287,46 +1286,30 @@ static OPJ_BOOL opj_t2_read_packet_data(opj_t2_t* p_t2, }; #endif /* USE_JPWL */ - /* Check possible overflow on size */ - if ((l_cblk->data_current_size + l_seg->newlen + OPJ_COMMON_CBLK_DATA_EXTRA) < - l_cblk->data_current_size) { - opj_event_msg(p_manager, EVT_ERROR, - "read: segment too long (%d) with current size (%d > %d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n", - l_seg->newlen, l_cblk->data_current_size, 0xFFFFFFFF - l_seg->newlen, cblkno, - p_pi->precno, bandno, p_pi->resno, p_pi->compno); - return OPJ_FALSE; - } - /* Check if the cblk->data have allocated enough memory */ - if ((l_cblk->data_current_size + l_seg->newlen + OPJ_COMMON_CBLK_DATA_EXTRA) > - l_cblk->data_max_size) { - OPJ_BYTE* new_cblk_data = (OPJ_BYTE*) opj_realloc(l_cblk->data, - l_cblk->data_current_size + l_seg->newlen + OPJ_COMMON_CBLK_DATA_EXTRA); - if (! 
new_cblk_data) { - opj_free(l_cblk->data); - l_cblk->data = NULL; - l_cblk->data_max_size = 0; - /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to realloc code block cata!\n"); */ + + if (l_seg->numchunks == l_seg->numchunksalloc) { + OPJ_UINT32 l_numchunksalloc = l_seg->numchunksalloc * 2 + 1; + opj_tcd_seg_data_chunk_t* l_chunks = + (opj_tcd_seg_data_chunk_t*)opj_realloc(l_seg->chunks, + l_numchunksalloc * sizeof(opj_tcd_seg_data_chunk_t)); + if (l_chunks == NULL) { + opj_event_msg(p_manager, EVT_ERROR, + "cannot allocate opj_tcd_seg_data_chunk_t* array"); return OPJ_FALSE; } - l_cblk->data_max_size = l_cblk->data_current_size + l_seg->newlen + - OPJ_COMMON_CBLK_DATA_EXTRA; - l_cblk->data = new_cblk_data; + l_seg->chunks = l_chunks; + l_seg->numchunksalloc = l_numchunksalloc; } - memcpy(l_cblk->data + l_cblk->data_current_size, l_current_data, l_seg->newlen); - - if (l_seg->numpasses == 0) { - l_seg->data = &l_cblk->data; - l_seg->dataindex = l_cblk->data_current_size; - } + l_seg->chunks[l_seg->numchunks].data = l_current_data; + l_seg->chunks[l_seg->numchunks].len = l_seg->newlen; + l_seg->numchunks ++; l_current_data += l_seg->newlen; l_seg->numpasses += l_seg->numnewpasses; l_cblk->numnewpasses -= l_seg->numnewpasses; l_seg->real_num_passes = l_seg->numpasses; - l_cblk->data_current_size += l_seg->newlen; - l_seg->len += l_seg->newlen; if (l_cblk->numnewpasses > 0) { ++l_seg; @@ -1391,7 +1374,6 @@ static OPJ_BOOL opj_t2_skip_packet_data(opj_t2_t* p_t2, if (!l_cblk->numsegs) { l_seg = l_cblk->segs; ++l_cblk->numsegs; - l_cblk->data_current_size = 0; } else { l_seg = &l_cblk->segs[l_cblk->numsegs - 1]; @@ -1464,22 +1446,23 @@ static OPJ_BOOL opj_t2_init_seg(opj_tcd_cblk_dec_t* cblk, if (l_nb_segs > cblk->m_current_max_segs) { opj_tcd_seg_t* new_segs; - cblk->m_current_max_segs += OPJ_J2K_DEFAULT_NB_SEGS; + OPJ_UINT32 l_m_current_max_segs = cblk->m_current_max_segs + + OPJ_J2K_DEFAULT_NB_SEGS; new_segs = (opj_tcd_seg_t*) opj_realloc(cblk->segs, - 
cblk->m_current_max_segs * sizeof(opj_tcd_seg_t)); + l_m_current_max_segs * sizeof(opj_tcd_seg_t)); if (! new_segs) { - opj_free(cblk->segs); - cblk->segs = NULL; - cblk->m_current_max_segs = 0; /* opj_event_msg(p_manager, EVT_ERROR, "Not enough memory to initialize segment %d\n", l_nb_segs); */ return OPJ_FALSE; } cblk->segs = new_segs; + memset(new_segs + cblk->m_current_max_segs, + 0, OPJ_J2K_DEFAULT_NB_SEGS * sizeof(opj_tcd_seg_t)); + cblk->m_current_max_segs = l_m_current_max_segs; } seg = &cblk->segs[index]; - memset(seg, 0, sizeof(opj_tcd_seg_t)); + opj_tcd_reinit_segment(seg); if (cblksty & J2K_CCP_CBLKSTY_TERMALL) { seg->maxpasses = 1; diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index 53e4ded3..4c4839d0 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -1210,20 +1210,23 @@ static OPJ_BOOL opj_tcd_code_block_enc_allocate_data(opj_tcd_cblk_enc_t * return OPJ_TRUE; } + +void opj_tcd_reinit_segment(opj_tcd_seg_t* seg) +{ + opj_tcd_seg_data_chunk_t *l_chunks = seg->chunks; + OPJ_UINT32 l_numchunksalloc = seg->numchunksalloc; + memset(seg, 0, sizeof(opj_tcd_seg_t)); + seg->chunks = l_chunks; + seg->numchunksalloc = l_numchunksalloc; +} + /** * Allocates memory for a decoding code block. */ static OPJ_BOOL opj_tcd_code_block_dec_allocate(opj_tcd_cblk_dec_t * p_code_block) { - if (! p_code_block->data) { - - p_code_block->data = (OPJ_BYTE*) opj_malloc(OPJ_COMMON_DEFAULT_CBLK_DATA_SIZE); - if (! p_code_block->data) { - return OPJ_FALSE; - } - p_code_block->data_max_size = OPJ_COMMON_DEFAULT_CBLK_DATA_SIZE; - /*fprintf(stderr, "Allocate 8192 elements of code_block->data\n");*/ + if (! 
p_code_block->segs) { p_code_block->segs = (opj_tcd_seg_t *) opj_calloc(OPJ_J2K_DEFAULT_NB_SEGS, sizeof(opj_tcd_seg_t)); @@ -1236,16 +1239,16 @@ static OPJ_BOOL opj_tcd_code_block_dec_allocate(opj_tcd_cblk_dec_t * /*fprintf(stderr, "m_current_max_segs of code_block->data = %d\n", p_code_block->m_current_max_segs);*/ } else { /* sanitize */ - OPJ_BYTE* l_data = p_code_block->data; - OPJ_UINT32 l_data_max_size = p_code_block->data_max_size; opj_tcd_seg_t * l_segs = p_code_block->segs; OPJ_UINT32 l_current_max_segs = p_code_block->m_current_max_segs; + OPJ_UINT32 i; memset(p_code_block, 0, sizeof(opj_tcd_cblk_dec_t)); - p_code_block->data = l_data; - p_code_block->data_max_size = l_data_max_size; p_code_block->segs = l_segs; p_code_block->m_current_max_segs = l_current_max_segs; + for (i = 0; i < l_current_max_segs; ++i) { + opj_tcd_reinit_segment(&l_segs[i]); + } } return OPJ_TRUE; @@ -1948,12 +1951,11 @@ static void opj_tcd_code_block_dec_deallocate(opj_tcd_precinct_t * p_precinct) for (cblkno = 0; cblkno < l_nb_code_blocks; ++cblkno) { - if (l_code_block->data) { - opj_free(l_code_block->data); - l_code_block->data = 00; - } - if (l_code_block->segs) { + OPJ_UINT32 i; + for (i = 0; i < l_code_block->m_current_max_segs; ++ i) { + opj_free(l_code_block->segs[i].chunks); + } opj_free(l_code_block->segs); l_code_block->segs = 00; } diff --git a/src/lib/openjp2/tcd.h b/src/lib/openjp2/tcd.h index cd750d1b..0de888d5 100644 --- a/src/lib/openjp2/tcd.h +++ b/src/lib/openjp2/tcd.h @@ -49,19 +49,6 @@ each other. The functions in TCD.C are used by other functions in J2K.C. 
/** @defgroup TCD TCD - Implementation of a tile coder/decoder */ /*@{*/ -/** -FIXME DOC -*/ -typedef struct opj_tcd_seg { - OPJ_BYTE ** data; - OPJ_UINT32 dataindex; - OPJ_UINT32 numpasses; - OPJ_UINT32 real_num_passes; - OPJ_UINT32 len; - OPJ_UINT32 maxpasses; - OPJ_UINT32 numnewpasses; - OPJ_UINT32 newlen; -} opj_tcd_seg_t; /** FIXME DOC @@ -102,19 +89,48 @@ typedef struct opj_tcd_cblk_enc { } opj_tcd_cblk_enc_t; +/** Chunk of codestream data that is part of a T1 segment */ +typedef struct opj_tcd_seg_data_chunk { + OPJ_BYTE * + data; /* Points to tilepart buffer. We don't make a copy ! + So the tilepart buffer must be kept alive + as long as we need to decode the codeblocks */ + OPJ_UINT32 len; /* Usable length of data */ +} opj_tcd_seg_data_chunk_t; + +/** Segment of a code-block. + * A segment represents a number of consecutive coding passes, without termination + * of MQC or RAW between them. */ +typedef struct opj_tcd_seg { + opj_tcd_seg_data_chunk_t* chunks; /* Array of chunks */ + OPJ_UINT32 numchunks; /* Number of valid chunk items */ + OPJ_UINT32 numchunksalloc; /* Number of chunk items allocated */ + OPJ_UINT32 + numpasses; /* Number of passes decoded. Including those that we skip */ + OPJ_UINT32 + real_num_passes; /* Number of passes actually to be decoded. To be used for code-block decoding */ + OPJ_UINT32 maxpasses; /* Maximum number of passes for this segment */ + OPJ_UINT32 + numnewpasses; /* Number of new passes for current packet. Transitory value */ + OPJ_UINT32 + newlen; /* Codestream length for this segment for current packet. 
Transitory value */ +} opj_tcd_seg_t; + +/* Code-block for decoding */ typedef struct opj_tcd_cblk_dec { - OPJ_BYTE * data; /* Data */ opj_tcd_seg_t* segs; /* segments information */ OPJ_INT32 x0, y0, x1, y1; /* position of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */ OPJ_UINT32 numbps; - OPJ_UINT32 numlenbits; - OPJ_UINT32 data_max_size; /* Size of allocated data buffer */ - OPJ_UINT32 data_current_size; /* Size of used data buffer */ - OPJ_UINT32 numnewpasses; /* number of pass added to the code-blocks */ - OPJ_UINT32 numsegs; /* number of segments */ - OPJ_UINT32 real_num_segs; - OPJ_UINT32 m_current_max_segs; + OPJ_UINT32 + numlenbits; /* number of bits for len, for the current packet. Transitory value */ + OPJ_UINT32 + numnewpasses; /* number of passes added to the code-blocks, for the current packet. Transitory value */ + OPJ_UINT32 + numsegs; /* number of segments, including those of packets we skip */ + OPJ_UINT32 + real_num_segs; /* number of segments, to be used for code block decoding */ + OPJ_UINT32 m_current_max_segs; /* allocated number of segs[] items */ } opj_tcd_cblk_dec_t; /** @@ -381,6 +397,9 @@ OPJ_BOOL opj_alloc_tile_component_data(opj_tcd_tilecomp_t *l_tilec); */ OPJ_BOOL opj_tcd_is_band_empty(opj_tcd_band_t* band); +/** Reinitialize a segment, without deallocating its chunks array */ +void opj_tcd_reinit_segment(opj_tcd_seg_t* seg); + /* ----------------------------------------------------------------------- */ /*@}*/