Patch by Callum Lerwick. The opj_tcd_cblk array is one of the largest allocations, because it contains a bunch of static buffers. This also makes it a major source of cache thrashing. This patch allocates the buffers from the heap, and dynamically sizes them in the decoder. I have not yet managed to dynamically size them in the encoder, getting the decoder to do it was tricky enough... I also split opj_tcd_cblk_t into separate encode and decode versions. A lot of fields were not used by both, so this cuts its size even further.

This commit is contained in:
Francois-Olivier Devaux 2007-11-14 10:52:02 +00:00
parent 24e189e4d8
commit 010ae27471
5 changed files with 107 additions and 61 deletions

View File

@ -7,9 +7,14 @@ What's New for OpenJPEG
November 14, 2007
! [FOD] First Patch by Callum Lerwick. Instead of reinventing realloc, j2k_read_sod now just uses opj_realloc in j2k.c
Second Patch by Callum Lerwick. This patch rearranges the largest memory allocations so they're allocated as
! [FOD] - First Patch by Callum Lerwick. Instead of reinventing realloc, j2k_read_sod now just uses opj_realloc in j2k.c
- Second Patch by Callum Lerwick. This patch rearranges the largest memory allocations so they're allocated as
late as possible, and freed as soon as possible. This cuts memory usage by about half on two large test images.
- Third Patch by Callum Lerwick. The opj_tcd_cblk array is one of the largest allocations, because it
contains a bunch of static buffers. This also makes it a major source of cache thrashing. This patch allocates
the buffers from the heap, and dynamically sizes them in the decoder. I have not yet managed to dynamically size
them in the encoder, getting the decoder to do it was tricky enough... I also split opj_tcd_cblk_t into separate
encode and decode versions. A lot of fields were not used by both, so this cuts its size even further.
November 13, 2007
! [FOD] Patch by Dzonatas and Callum Lerwick.

View File

@ -194,7 +194,7 @@ Encode 1 code-block
*/
static void t1_encode_cblk(
opj_t1_t *t1,
opj_tcd_cblk_t * cblk,
opj_tcd_cblk_enc_t* cblk,
int orient,
int compno,
int level,
@ -213,7 +213,7 @@ Decode 1 code-block
*/
static void t1_decode_cblk(
opj_t1_t *t1,
opj_tcd_cblk_t * cblk,
opj_tcd_cblk_dec_t* cblk,
int orient,
int roishift,
int cblksty);
@ -792,7 +792,7 @@ static bool allocate_buffers(
/** mod fixed_quality */
static void t1_encode_cblk(
opj_t1_t *t1,
opj_tcd_cblk_t * cblk,
opj_tcd_cblk_enc_t* cblk,
int orient,
int compno,
int level,
@ -925,7 +925,7 @@ static void t1_encode_cblk(
static void t1_decode_cblk(
opj_t1_t *t1,
opj_tcd_cblk_t * cblk,
opj_tcd_cblk_dec_t* cblk,
int orient,
int roishift,
int cblksty)
@ -958,10 +958,14 @@ static void t1_decode_cblk(
/* BYPASS mode */
type = ((bpno <= (cblk->numbps - 1) - 4) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
/* FIXME: slviewer gets here with a null pointer. Why? Partially downloaded and/or corrupt textures? */
if(seg->data == NULL){
continue;
}
if (type == T1_TYPE_RAW) {
raw_init_dec(raw, seg->data, seg->len);
raw_init_dec(raw, (*seg->data) + seg->dataindex, seg->len);
} else {
mqc_init_dec(mqc, seg->data, seg->len);
mqc_init_dec(mqc, (*seg->data) + seg->dataindex, seg->len);
}
for (passno = 0; passno < seg->numpasses; ++passno) {
@ -1046,7 +1050,7 @@ void t1_encode_cblks(
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; ++cblkno) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
int* restrict datap;
int* restrict tiledp;
int cblk_w;
@ -1134,7 +1138,7 @@ void t1_decode_cblks(
opj_tcd_precinct_t* precinct = &band->precincts[precno];
for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
opj_tcd_cblk_t* cblk = &precinct->cblks[cblkno];
opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
int* restrict datap;
void* restrict tiledp;
int cblk_w, cblk_h;
@ -1193,8 +1197,10 @@ void t1_decode_cblks(
}
}
}
opj_free(cblk->data);
opj_free(cblk->segs);
} /* cblkno */
opj_free(precinct->cblks);
opj_free(precinct->cblks.dec);
} /* precno */
} /* bandno */
} /* resno */

View File

@ -63,7 +63,7 @@ static int t2_encode_packet(opj_tcd_tile_t *tile, opj_tcp_t *tcp, opj_pi_iterato
@param cblksty
@param first
*/
static void t2_init_seg(opj_tcd_seg_t *seg, int cblksty, int first);
static void t2_init_seg(opj_tcd_cblk_dec_t* cblk, int index, int cblksty, int first);
/**
Decode a packet of a tile from a source buffer
@param t2 T2 handle
@ -160,7 +160,7 @@ static int t2_encode_packet(opj_tcd_tile_t * tile, opj_tcp_t * tcp, opj_pi_itera
tgt_reset(prc->incltree);
tgt_reset(prc->imsbtree);
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
cblk->numpasses = 0;
tgt_setvalue(prc->imsbtree, cblkno, band->numbps - cblk->numbps);
}
@ -176,14 +176,14 @@ static int t2_encode_packet(opj_tcd_tile_t * tile, opj_tcp_t * tcp, opj_pi_itera
opj_tcd_band_t *band = &res->bands[bandno];
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
opj_tcd_layer_t *layer = &cblk->layers[layno];
if (!cblk->numpasses && layer->numpasses) {
tgt_setvalue(prc->incltree, cblkno, layno);
}
}
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
opj_tcd_layer_t *layer = &cblk->layers[layno];
int increment = 0;
int nump = 0;
@ -267,7 +267,7 @@ static int t2_encode_packet(opj_tcd_tile_t * tile, opj_tcp_t * tcp, opj_pi_itera
opj_tcd_band_t *band = &res->bands[bandno];
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
opj_tcd_layer_t *layer = &cblk->layers[layno];
if (!layer->numpasses) {
continue;
@ -294,7 +294,12 @@ static int t2_encode_packet(opj_tcd_tile_t * tile, opj_tcp_t * tcp, opj_pi_itera
return (c - dest);
}
static void t2_init_seg(opj_tcd_seg_t * seg, int cblksty, int first) {
static void t2_init_seg(opj_tcd_cblk_dec_t* cblk, int index, int cblksty, int first) {
opj_tcd_seg_t* seg;
cblk->segs = (opj_tcd_seg_t*) opj_realloc(cblk->segs, (index + 1) * sizeof(opj_tcd_seg_t));
seg = &cblk->segs[index];
seg->data = NULL;
seg->dataindex = 0;
seg->numpasses = 0;
seg->len = 0;
if (cblksty & J2K_CCP_CBLKSTY_TERMALL) {
@ -340,7 +345,7 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
tgt_reset(prc->incltree);
tgt_reset(prc->imsbtree);
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_dec_t* cblk = &prc->cblks.dec[cblkno];
cblk->numsegs = 0;
}
}
@ -424,9 +429,8 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
if ((band->x1-band->x0 == 0)||(band->y1-band->y0 == 0)) continue;
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
int included, increment, n;
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_seg_t *seg = NULL;
int included, increment, n, segno;
opj_tcd_cblk_dec_t* cblk = &prc->cblks.dec[cblkno];
/* if cblk not yet included before --> inclusion tagtree */
if (!cblk->numsegs) {
included = tgt_decode(bio, prc->incltree, cblkno, layno + 1);
@ -454,23 +458,25 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
increment = t2_getcommacode(bio);
/* length indicator increment */
cblk->numlenbits += increment;
segno = 0;
if (!cblk->numsegs) {
seg = &cblk->segs[0];
t2_init_seg(seg, tcp->tccps[compno].cblksty, 1);
t2_init_seg(cblk, segno, tcp->tccps[compno].cblksty, 1);
} else {
seg = &cblk->segs[cblk->numsegs - 1];
if (seg->numpasses == seg->maxpasses) {
t2_init_seg(++seg, tcp->tccps[compno].cblksty, 0);
segno = cblk->numsegs - 1;
if (cblk->segs[segno].numpasses == cblk->segs[segno].maxpasses) {
++segno;
t2_init_seg(cblk, segno, tcp->tccps[compno].cblksty, 0);
}
}
n = cblk->numnewpasses;
do {
seg->numnewpasses = int_min(seg->maxpasses - seg->numpasses, n);
seg->newlen = bio_read(bio, cblk->numlenbits + int_floorlog2(seg->numnewpasses));
n -= seg->numnewpasses;
cblk->segs[segno].numnewpasses = int_min(cblk->segs[segno].maxpasses - cblk->segs[segno].numpasses, n);
cblk->segs[segno].newlen = bio_read(bio, cblk->numlenbits + int_floorlog2(cblk->segs[segno].numnewpasses));
n -= cblk->segs[segno].numnewpasses;
if (n > 0) {
t2_init_seg(++seg, tcp->tccps[compno].cblksty, 0);
++segno;
t2_init_seg(cblk, segno, tcp->tccps[compno].cblksty, 0);
}
} while (n > 0);
}
@ -518,7 +524,7 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
if ((band->x1-band->x0 == 0)||(band->y1-band->y0 == 0)) continue;
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_dec_t* cblk = &prc->cblks.dec[cblkno];
opj_tcd_seg_t *seg = NULL;
if (!cblk->numnewpasses)
continue;
@ -559,9 +565,11 @@ static int t2_decode_packet(opj_t2_t* t2, unsigned char *src, int len, opj_tcd_t
#endif /* USE_JPWL */
cblk->data = (unsigned char*) opj_realloc(cblk->data, (cblk->len + seg->newlen) * sizeof(unsigned char*));
memcpy(cblk->data + cblk->len, c, seg->newlen);
if (seg->numpasses == 0) {
seg->data = cblk->data + cblk->len;
seg->data = &cblk->data;
seg->dataindex = cblk->len;
}
c += seg->newlen;
cblk->len += seg->newlen;

View File

@ -33,7 +33,7 @@
#include "opj_includes.h"
void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) {
int tileno, compno, resno, bandno, precno, cblkno;
int tileno, compno, resno, bandno, precno;//, cblkno;
fprintf(fd, "image {\n");
fprintf(fd, " tw=%d, th=%d x0=%d x1=%d y0=%d y1=%d\n",
@ -68,6 +68,7 @@ void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) {
fprintf(fd,
" x0=%d, y0=%d, x1=%d, y1=%d, cw=%d, ch=%d\n",
prec->x0, prec->y0, prec->x1, prec->y1, prec->cw, prec->ch);
/*
for (cblkno = 0; cblkno < prec->cw * prec->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prec->cblks[cblkno];
fprintf(fd, " cblk {\n");
@ -76,6 +77,7 @@ void tcd_dump(FILE *fd, opj_tcd_t *tcd, opj_tcd_image_t * img) {
cblk->x0, cblk->y0, cblk->x1, cblk->y1);
fprintf(fd, " }\n");
}
*/
fprintf(fd, " }\n");
}
fprintf(fd, " }\n");
@ -313,7 +315,7 @@ void tcd_malloc_encode(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp, int c
prc->cw = (brcblkxend - tlcblkxstart) >> cblkwidthexpn;
prc->ch = (brcblkyend - tlcblkystart) >> cblkheightexpn;
prc->cblks = (opj_tcd_cblk_t*) opj_calloc((prc->cw * prc->ch), sizeof(opj_tcd_cblk_t));
prc->cblks.enc = (opj_tcd_cblk_enc_t*) opj_calloc((prc->cw * prc->ch), sizeof(opj_tcd_cblk_enc_t));
prc->incltree = tgt_create(prc->cw, prc->ch);
prc->imsbtree = tgt_create(prc->cw, prc->ch);
@ -323,13 +325,18 @@ void tcd_malloc_encode(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp, int c
int cblkxend = cblkxstart + (1 << cblkwidthexpn);
int cblkyend = cblkystart + (1 << cblkheightexpn);
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
/* code-block size (global) */
cblk->x0 = int_max(cblkxstart, prc->x0);
cblk->y0 = int_max(cblkystart, prc->y0);
cblk->x1 = int_min(cblkxend, prc->x1);
cblk->y1 = int_min(cblkyend, prc->y1);
cblk->data = (unsigned char*) opj_calloc(8192+2, sizeof(unsigned char));
/* FIXME: mqc_init_enc and mqc_byteout underrun the buffer if we don't do this. Why? */
cblk->data += 2;
cblk->layers = (opj_tcd_layer_t*) opj_calloc(100, sizeof(opj_tcd_layer_t));
cblk->passes = (opj_tcd_pass_t*) opj_calloc(100, sizeof(opj_tcd_pass_t));
}
}
}
@ -341,7 +348,7 @@ void tcd_malloc_encode(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp, int c
}
void tcd_free_encode(opj_tcd_t *tcd) {
int tileno, compno, resno, bandno, precno;
int tileno, compno, resno, bandno, precno, cblkno;
for (tileno = 0; tileno < 1; tileno++) {
opj_tcd_tile_t *tile = tcd->tcd_image->tiles;
@ -366,8 +373,12 @@ void tcd_free_encode(opj_tcd_t *tcd) {
tgt_destroy(prc->imsbtree);
prc->imsbtree = NULL;
}
opj_free(prc->cblks);
prc->cblks = NULL;
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_free(prc->cblks.enc[cblkno].data - 2);
opj_free(prc->cblks.enc[cblkno].layers);
opj_free(prc->cblks.enc[cblkno].passes);
}
opj_free(prc->cblks.enc);
} /* for (precno */
opj_free(band->precincts);
band->precincts = NULL;
@ -547,8 +558,8 @@ void tcd_init_encode(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp, int cur
prc->cw = (brcblkxend - tlcblkxstart) >> cblkwidthexpn;
prc->ch = (brcblkyend - tlcblkystart) >> cblkheightexpn;
opj_free(prc->cblks);
prc->cblks = (opj_tcd_cblk_t*) opj_calloc(prc->cw * prc->ch, sizeof(opj_tcd_cblk_t));
opj_free(prc->cblks.enc);
prc->cblks.enc = (opj_tcd_cblk_enc_t*) opj_calloc(prc->cw * prc->ch, sizeof(opj_tcd_cblk_enc_t));
if (prc->incltree != NULL) {
tgt_destroy(prc->incltree);
@ -566,13 +577,16 @@ void tcd_init_encode(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp, int cur
int cblkxend = cblkxstart + (1 << cblkwidthexpn);
int cblkyend = cblkystart + (1 << cblkheightexpn);
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t* cblk = &prc->cblks.enc[cblkno];
/* code-block size (global) */
cblk->x0 = int_max(cblkxstart, prc->x0);
cblk->y0 = int_max(cblkystart, prc->y0);
cblk->x1 = int_min(cblkxend, prc->x1);
cblk->y1 = int_min(cblkyend, prc->y1);
cblk->data = (unsigned char*) opj_calloc(8192, sizeof(unsigned char));
cblk->layers = (opj_tcd_layer_t*) opj_calloc(100, sizeof(opj_tcd_layer_t));
cblk->passes = (opj_tcd_pass_t*) opj_calloc(100, sizeof(opj_tcd_pass_t));
}
} /* precno */
} /* bandno */
@ -780,7 +794,7 @@ void tcd_malloc_decode_tile(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp,
prc->cw = (brcblkxend - tlcblkxstart) >> cblkwidthexpn;
prc->ch = (brcblkyend - tlcblkystart) >> cblkheightexpn;
prc->cblks = (opj_tcd_cblk_t *) opj_malloc(prc->cw * prc->ch * sizeof(opj_tcd_cblk_t));
prc->cblks.dec = (opj_tcd_cblk_dec_t*) opj_malloc(prc->cw * prc->ch * sizeof(opj_tcd_cblk_dec_t));
prc->incltree = tgt_create(prc->cw, prc->ch);
prc->imsbtree = tgt_create(prc->cw, prc->ch);
@ -791,8 +805,10 @@ void tcd_malloc_decode_tile(opj_tcd_t *tcd, opj_image_t * image, opj_cp_t * cp,
int cblkxend = cblkxstart + (1 << cblkwidthexpn);
int cblkyend = cblkystart + (1 << cblkheightexpn);
opj_tcd_cblk_dec_t* cblk = &prc->cblks.dec[cblkno];
cblk->data = NULL;
cblk->segs = NULL;
/* code-block size (global) */
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
cblk->x0 = int_max(cblkxstart, prc->x0);
cblk->y0 = int_max(cblkystart, prc->y0);
cblk->x1 = int_min(cblkxend, prc->x1);
@ -837,7 +853,7 @@ void tcd_makelayer_fixed(opj_tcd_t *tcd, int layno, int final) {
for (precno = 0; precno < res->pw * res->ph; precno++) {
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno];
opj_tcd_layer_t *layer = &cblk->layers[layno];
int n;
int imsb = tcd->image->comps[compno].prec - cblk->numbps; /* number of bit-plan equal to zero */
@ -918,7 +934,7 @@ void tcd_makelayer(opj_tcd_t *tcd, int layno, double thresh, int final) {
for (precno = 0; precno < res->pw * res->ph; precno++) {
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno];
opj_tcd_layer_t *layer = &cblk->layers[layno];
int n;
@ -1002,7 +1018,7 @@ bool tcd_rateallocate(opj_tcd_t *tcd, unsigned char *dest, int len, opj_codestre
opj_tcd_precinct_t *prc = &band->precincts[precno];
for (cblkno = 0; cblkno < prc->cw * prc->ch; cblkno++) {
opj_tcd_cblk_t *cblk = &prc->cblks[cblkno];
opj_tcd_cblk_enc_t *cblk = &prc->cblks.enc[cblkno];
for (passno = 0; passno < cblk->totalpasses; passno++) {
opj_tcd_pass_t *pass = &cblk->passes[passno];

View File

@ -45,9 +45,10 @@ each other. The functions in TCD.C are used by some function in J2K.C.
FIXME: documentation
*/
typedef struct opj_tcd_seg {
unsigned char** data;
int dataindex;
int numpasses;
int len;
unsigned char *data;
int maxpasses;
int numnewpasses;
int newlen;
@ -75,21 +76,28 @@ typedef struct opj_tcd_layer {
/**
FIXME: documentation
*/
typedef struct opj_tcd_cblk {
typedef struct opj_tcd_cblk_enc {
unsigned char* data; /* Data */
opj_tcd_layer_t* layers; /* layer information */
opj_tcd_pass_t* passes; /* information about the passes */
int x0, y0, x1, y1; /* dimension of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */
int numbps;
int numlenbits;
int len; /* length */
int numpasses; /* number of pass already done for the code-blocks */
int numpassesinlayers; /* number of passes in the layer */
int totalpasses; /* total number of passes */
} opj_tcd_cblk_enc_t;
typedef struct opj_tcd_cblk_dec {
unsigned char* data; /* Data */
opj_tcd_seg_t* segs; /* segments informations */
int x0, y0, x1, y1; /* dimension of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */
int numbps;
int numlenbits;
int len; /* length */
int numnewpasses; /* number of pass added to the code-blocks */
int numsegs; /* number of segments */
opj_tcd_seg_t segs[100]; /* segments informations */
unsigned char data[8192]; /* Data */
int numpassesinlayers; /* number of passes in the layer */
opj_tcd_layer_t layers[100]; /* layer information */
int totalpasses; /* total number of passes */
opj_tcd_pass_t passes[100]; /* information about the passes */
} opj_tcd_cblk_t;
} opj_tcd_cblk_dec_t;
/**
FIXME: documentation
@ -97,7 +105,10 @@ FIXME: documentation
typedef struct opj_tcd_precinct {
int x0, y0, x1, y1; /* dimension of the precinct : left upper corner (x0, y0) right low corner (x1,y1) */
int cw, ch; /* number of precinct in width and heigth */
opj_tcd_cblk_t *cblks; /* code-blocks informations */
union{ /* code-blocks informations */
opj_tcd_cblk_enc_t* enc;
opj_tcd_cblk_dec_t* dec;
} cblks;
opj_tgt_tree_t *incltree; /* inclusion tree */
opj_tgt_tree_t *imsbtree; /* IMSB tree */
} opj_tcd_precinct_t;