From 627f484bce7b572dfcbd13f90f5f6fb083d1008c Mon Sep 17 00:00:00 2001 From: Francois-Olivier Devaux Date: Thu, 8 Apr 2010 17:22:58 +0000 Subject: [PATCH] Significant optimizations of MCT, DWT, MQ and T1 modules by Peter Wimmer (thanks Peter) --- ChangeLog | 3 + libopenjpeg/dwt.c | 79 +++++-- libopenjpeg/j2k.h | 12 +- libopenjpeg/mct.c | 42 ++++ libopenjpeg/mqc.c | 75 +++++- libopenjpeg/mqc.h | 5 +- libopenjpeg/opj_includes.h | 2 +- libopenjpeg/opj_malloc.h | 23 ++ libopenjpeg/t1.c | 457 +++++++++++++++++++++++++++++++++---- 9 files changed, 612 insertions(+), 86 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6370aa94..7a8752e4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,6 +5,9 @@ What's New for OpenJPEG ! : changed + : added +April 8, 2010 +! [FOD] Significant optimizations of MCT, DWT, MQ and T1 modules by Peter Wimmer (thanks Peter) + March 26, 2010 + [FOD] Added support for "jpc" codestreams which are equivalent to "j2c" codestreams. Thanks to Winfried for this patch * [FOD] Added support for PNG image format [Not yet functional under WIN32]. Thanks to Winfried for this patch. 
See details here http://code.google.com/p/openjpeg/issues/detail?id=16 diff --git a/libopenjpeg/dwt.c b/libopenjpeg/dwt.c index 78d18d17..a8d579fa 100644 --- a/libopenjpeg/dwt.c +++ b/libopenjpeg/dwt.c @@ -570,6 +570,20 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in int count = w->sn; int i, k; for(k = 0; k < 2; ++k){ + if (count + 3 * x < size && ((int) a & 0x0f) == 0 && ((int) bi & 0x0f) == 0 && (x & 0x0f) == 0) { + /* Fast code path */ + for(i = 0; i < count; ++i){ + int j = i; + bi[i*8 ] = a[j]; + j += x; + bi[i*8 + 1] = a[j]; + j += x; + bi[i*8 + 2] = a[j]; + j += x; + bi[i*8 + 3] = a[j]; + } + } else { + /* Slow code path */ for(i = 0; i < count; ++i){ int j = i; bi[i*8 ] = a[j]; @@ -583,6 +597,7 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in if(j > size) continue; bi[i*8 + 3] = a[j]; } + } bi = (float*) (w->wavelet + 1 - w->cas); a += w->sn; size -= w->sn; @@ -608,9 +623,21 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){ static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){ __m128* restrict vw = (__m128*) w; int i; + /* 4x unrolled loop */ + for(i = 0; i < count >> 2; ++i){ + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + } + count &= 3; for(i = 0; i < count; ++i){ - __m128 tmp = vw[i*2]; - vw[i*2] = tmp * c; + *vw = _mm_mul_ps(*vw, c); + vw += 2; } } @@ -618,22 +645,24 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){ __m128* restrict vl = (__m128*) l; __m128* restrict vw = (__m128*) w; int i; + __m128 tmp1, tmp2, tmp3; + tmp1 = vl[0]; for(i = 0; i < m; ++i){ - __m128 tmp1 = vl[ 0]; - __m128 tmp2 = vw[-1]; - __m128 tmp3 = vw[ 0]; - vw[-1] = tmp2 + ((tmp1 + tmp3) * c); - vl = vw; + tmp2 = vw[-1]; + tmp3 = vw[ 0]; + vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); + tmp1 = tmp3; vw += 
2; } + vl = vw - 2; if(m >= k){ return; } - c += c; - c *= vl[0]; + c = _mm_add_ps(c, c); + c = _mm_mul_ps(c, vl[0]); for(; m < k; ++m){ __m128 tmp = vw[-1]; - vw[-1] = tmp + c; + vw[-1] = _mm_add_ps(tmp, c); vw += 2; } } @@ -773,19 +802,24 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){ h.dn = rw - h.sn; h.cas = res->x0 % 2; - for(j = rh; j > 0; j -= 4){ + for(j = rh; j > 3; j -= 4){ + int k; v4dwt_interleave_h(&h, aj, w, bufsize); v4dwt_decode(&h); - if(j >= 4){ - int k; for(k = rw; --k >= 0;){ aj[k ] = h.wavelet[k].f[0]; aj[k+w ] = h.wavelet[k].f[1]; aj[k+w*2] = h.wavelet[k].f[2]; aj[k+w*3] = h.wavelet[k].f[3]; } - }else{ + aj += w*4; + bufsize -= w*4; + } + if (rh & 0x03) { int k; + j = rh & 0x03; + v4dwt_interleave_h(&h, aj, w, bufsize); + v4dwt_decode(&h); for(k = rw; --k >= 0;){ switch(j) { case 3: aj[k+w*2] = h.wavelet[k].f[2]; @@ -794,30 +828,29 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){ } } } - aj += w*4; - bufsize -= w*4; - } v.dn = rh - v.sn; v.cas = res->y0 % 2; aj = (float*) tilec->data; - for(j = rw; j > 0; j -= 4){ + for(j = rw; j > 3; j -= 4){ + int k; v4dwt_interleave_v(&v, aj, w); v4dwt_decode(&v); - if(j >= 4){ - int k; for(k = 0; k < rh; ++k){ memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float)); } - }else{ + aj += 4; + } + if (rw & 0x03){ int k; + j = rw & 0x03; + v4dwt_interleave_v(&v, aj, w); + v4dwt_decode(&v); for(k = 0; k < rh; ++k){ memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float)); } } - aj += 4; - } } opj_aligned_free(h.wavelet); diff --git a/libopenjpeg/j2k.h b/libopenjpeg/j2k.h index 5599be47..8fc8e6dc 100644 --- a/libopenjpeg/j2k.h +++ b/libopenjpeg/j2k.h @@ -45,12 +45,12 @@ The functions in J2K.C have for goal to read/write the several parts of the code #define J2K_CP_CSTY_SOP 0x02 #define J2K_CP_CSTY_EPH 0x04 #define J2K_CCP_CSTY_PRT 0x01 -#define J2K_CCP_CBLKSTY_LAZY 0x01 -#define J2K_CCP_CBLKSTY_RESET 0x02 -#define J2K_CCP_CBLKSTY_TERMALL 0x04 -#define J2K_CCP_CBLKSTY_VSC 
0x08 -#define J2K_CCP_CBLKSTY_PTERM 0x10 -#define J2K_CCP_CBLKSTY_SEGSYM 0x20 +#define J2K_CCP_CBLKSTY_LAZY 0x01 /**< Selective arithmetic coding bypass */ +#define J2K_CCP_CBLKSTY_RESET 0x02 /**< Reset context probabilities on coding pass boundaries */ +#define J2K_CCP_CBLKSTY_TERMALL 0x04 /**< Termination on each coding pass */ +#define J2K_CCP_CBLKSTY_VSC 0x08 /**< Vertically stripe causal context */ +#define J2K_CCP_CBLKSTY_PTERM 0x10 /**< Predictable termination */ +#define J2K_CCP_CBLKSTY_SEGSYM 0x20 /**< Segmentation symbols are used */ #define J2K_CCP_QNTSTY_NOQNT 0 #define J2K_CCP_QNTSTY_SIQNT 1 #define J2K_CCP_QNTSTY_SEQNT 2 diff --git a/libopenjpeg/mct.c b/libopenjpeg/mct.c index ca21744f..870993b0 100644 --- a/libopenjpeg/mct.c +++ b/libopenjpeg/mct.c @@ -29,6 +29,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#ifdef __SSE__ +#include <xmmintrin.h> +#endif + #include "opj_includes.h" /* */ @@ -127,6 +131,44 @@ void mct_decode_real( int n) { int i; +#ifdef __SSE__ + __m128 vrv, vgu, vgv, vbu; + vrv = _mm_set1_ps(1.402f); + vgu = _mm_set1_ps(0.34413f); + vgv = _mm_set1_ps(0.71414f); + vbu = _mm_set1_ps(1.772f); + for (i = 0; i < (n >> 3); ++i) { + __m128 vy, vu, vv; + __m128 vr, vg, vb; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; +#endif for(i = 0; i < n; ++i) { float y = c0[i]; float u = c1[i]; diff --git a/libopenjpeg/mqc.c
b/libopenjpeg/mqc.c index 9aa9d2c2..e26cd80b 100644 --- a/libopenjpeg/mqc.c +++ b/libopenjpeg/mqc.c @@ -68,24 +68,23 @@ FIXME: documentation ??? @param mqc MQC handle @return */ -static int mqc_mpsexchange(opj_mqc_t *mqc); +static INLINE int mqc_mpsexchange(opj_mqc_t *const mqc); /** FIXME: documentation ??? @param mqc MQC handle @return */ -static int mqc_lpsexchange(opj_mqc_t *mqc); +static INLINE int mqc_lpsexchange(opj_mqc_t *const mqc); /** Input a byte @param mqc MQC handle */ -static void mqc_bytein(opj_mqc_t *mqc); +static INLINE void mqc_bytein(opj_mqc_t *const mqc); /** Renormalize mqc->a and mqc->c while decoding @param mqc MQC handle */ -static void mqc_renormd(opj_mqc_t *mqc); - +static INLINE void mqc_renormd(opj_mqc_t *const mqc); /*@}*/ /*@}*/ @@ -271,7 +270,7 @@ static void mqc_setbits(opj_mqc_t *mqc) { } } -static int mqc_mpsexchange(opj_mqc_t *mqc) { +static INLINE int mqc_mpsexchange(opj_mqc_t *const mqc) { int d; if (mqc->a < (*mqc->curctx)->qeval) { d = 1 - (*mqc->curctx)->mps; @@ -284,7 +283,7 @@ static int mqc_mpsexchange(opj_mqc_t *mqc) { return d; } -static int mqc_lpsexchange(opj_mqc_t *mqc) { +static INLINE int mqc_lpsexchange(opj_mqc_t *const mqc) { int d; if (mqc->a < (*mqc->curctx)->qeval) { mqc->a = (*mqc->curctx)->qeval; @@ -299,7 +298,15 @@ static int mqc_lpsexchange(opj_mqc_t *mqc) { return d; } -static void mqc_bytein(opj_mqc_t *mqc) { +#ifdef MQC_PERF_OPT +static INLINE void mqc_bytein(opj_mqc_t *const mqc) { + unsigned int i = *((unsigned int *) mqc->bp); + mqc->c += i & 0xffff00; + mqc->ct = i & 0x0f; + mqc->bp += (i >> 2) & 0x04; +} +#else +static void mqc_bytein(opj_mqc_t *const mqc) { if (mqc->bp != mqc->end) { unsigned int c; if (mqc->bp + 1 != mqc->end) { @@ -326,8 +333,9 @@ static void mqc_bytein(opj_mqc_t *mqc) { mqc->ct = 8; } } +#endif -static void mqc_renormd(opj_mqc_t *mqc) { +static INLINE void mqc_renormd(opj_mqc_t *const mqc) { do { if (mqc->ct == 0) { mqc_bytein(mqc); @@ -346,11 +354,19 @@ static void 
mqc_renormd(opj_mqc_t *mqc) { opj_mqc_t* mqc_create(void) { opj_mqc_t *mqc = (opj_mqc_t*)opj_malloc(sizeof(opj_mqc_t)); +#ifdef MQC_PERF_OPT + mqc->buffer = NULL; +#endif return mqc; } void mqc_destroy(opj_mqc_t *mqc) { if(mqc) { +#ifdef MQC_PERF_OPT + if (mqc->buffer) { + opj_free(mqc->buffer); + } +#endif opj_free(mqc); } } @@ -499,13 +515,52 @@ void mqc_init_dec(opj_mqc_t *mqc, unsigned char *bp, int len) { mqc->bp = bp; if (len==0) mqc->c = 0xff << 16; else mqc->c = *mqc->bp << 16; + +#ifdef MQC_PERF_OPT + { + unsigned int c; + unsigned int *ip; + unsigned char *end = mqc->end - 1; + mqc->buffer = opj_realloc(mqc->buffer, (2 * len + 1) * sizeof(unsigned int)); + ip = (unsigned int *) mqc->buffer; + + while (bp != end) { + c = *(bp + 1); + if (*bp == 0xff) { + if (c > 0x8f) { + *ip = 0x0000ff18; + } else { + bp++; + *ip = 0x00000017 | (c << 9); + } + } else { + bp++; + *ip = 0x00000018 | (c << 8); + } + ip++; + } + + /* Handle last byte of data */ + c = 0xff; + if (*bp == 0xff) { + *ip = 0x0000ff18; + } else { + bp++; + *ip = 0x00000018 | (c << 8); + } + ip++; + + *ip = 0x0000ff08; + mqc->bp = mqc->buffer; + } +#endif mqc_bytein(mqc); mqc->c <<= 7; mqc->ct -= 7; mqc->a = 0x8000; } -int mqc_decode(opj_mqc_t *mqc) { +INLINE int mqc_decode(opj_mqc_t *const mqc) { int d; mqc->a -= (*mqc->curctx)->qeval; if ((mqc->c >> 16) < (*mqc->curctx)->qeval) { diff --git a/libopenjpeg/mqc.h b/libopenjpeg/mqc.h index 8cc8c934..d00cd106 100644 --- a/libopenjpeg/mqc.h +++ b/libopenjpeg/mqc.h @@ -70,6 +70,9 @@ typedef struct opj_mqc { unsigned char *end; opj_mqc_state_t *ctxs[MQC_NUMCTXS]; opj_mqc_state_t **curctx; +#ifdef MQC_PERF_OPT + unsigned char *buffer; +#endif } opj_mqc_t; /** @name Exported functions */ @@ -188,7 +191,7 @@ Decode a symbol @param mqc MQC handle @return Returns the decoded symbol (0 or 1) */ -int mqc_decode(opj_mqc_t *mqc); +int mqc_decode(opj_mqc_t *const mqc); /* ----------------------------------------------------------------------- */ /*@}*/ diff --git 
a/libopenjpeg/opj_includes.h b/libopenjpeg/opj_includes.h index 80d43df9..a0e64a8a 100644 --- a/libopenjpeg/opj_includes.h +++ b/libopenjpeg/opj_includes.h @@ -65,7 +65,7 @@ Most compilers implement their own version of this keyword ... */ #ifndef INLINE #if defined(_MSC_VER) - #define INLINE __inline + #define INLINE __forceinline #elif defined(__GNUC__) #define INLINE __inline__ #elif defined(__MWERKS__) diff --git a/libopenjpeg/opj_malloc.h b/libopenjpeg/opj_malloc.h index 9b48c256..c477aec0 100644 --- a/libopenjpeg/opj_malloc.h +++ b/libopenjpeg/opj_malloc.h @@ -45,7 +45,11 @@ Allocate an uninitialized memory block @param size Bytes to allocate @return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available */ +#ifdef ALLOC_PERF_OPT +void * OPJ_CALLCONV opj_malloc(size_t size); +#else #define opj_malloc(size) malloc(size) +#endif /** Allocate a memory block with elements initialized to 0 @@ -53,7 +57,11 @@ Allocate a memory block with elements initialized to 0 @param size Bytes per block to allocate @return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available */ +#ifdef ALLOC_PERF_OPT +void * OPJ_CALLCONV opj_calloc(size_t _NumOfElements, size_t _SizeOfElements); +#else #define opj_calloc(num, size) calloc(num, size) +#endif /** Allocate memory aligned to a 16 byte boundry @@ -113,19 +121,34 @@ Allocate memory aligned to a 16 byte boundry #define opj_aligned_free(m) free(m) #endif +#ifdef ALLOC_PERF_OPT + #undef opj_aligned_malloc + #define opj_aligned_malloc(size) opj_malloc(size) + #undef opj_aligned_free + #define opj_aligned_free(m) opj_free(m) +#endif + /** Reallocate memory blocks. 
@param memblock Pointer to previously allocated memory block @param size New size in bytes @return Returns a void pointer to the reallocated (and possibly moved) memory block */ +#ifdef ALLOC_PERF_OPT +void * OPJ_CALLCONV opj_realloc(void * _Memory, size_t NewSize); +#else #define opj_realloc(m, s) realloc(m, s) +#endif /** Deallocates or frees a memory block. @param memblock Previously allocated memory block to be freed */ +#ifdef ALLOC_PERF_OPT +void OPJ_CALLCONV opj_free(void * _Memory); +#else #define opj_free(m) free(m) +#endif #ifdef __GNUC__ #pragma GCC poison malloc calloc realloc free diff --git a/libopenjpeg/t1.c b/libopenjpeg/t1.c index a78b700f..14d5b5cf 100644 --- a/libopenjpeg/t1.c +++ b/libopenjpeg/t1.c @@ -62,13 +62,25 @@ static void t1_enc_sigpass_step( /** Decode significant pass */ -static void t1_dec_sigpass_step( +static INLINE void t1_dec_sigpass_step_raw( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf, + int vsc); +static INLINE void t1_dec_sigpass_step_mqc( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf); +static INLINE void t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, flag_t *flagsp, int *datap, int orient, int oneplushalf, - char type, int vsc); /** Encode significant pass @@ -83,12 +95,19 @@ static void t1_enc_sigpass( /** Decode significant pass */ -static void t1_dec_sigpass( +static void t1_dec_sigpass_raw( opj_t1_t *t1, int bpno, int orient, - char type, int cblksty); +static void t1_dec_sigpass_mqc( + opj_t1_t *t1, + int bpno, + int orient); +static void t1_dec_sigpass_mqc_vsc( + opj_t1_t *t1, + int bpno, + int orient); /** Encode refinement pass */ @@ -104,14 +123,27 @@ static void t1_enc_refpass_step( /** Decode refinement pass */ -static void t1_dec_refpass_step( +static void INLINE t1_dec_refpass_step_raw( opj_t1_t *t1, flag_t *flagsp, int *datap, int poshalf, int neghalf, - char type, int vsc); +static void INLINE t1_dec_refpass_step_mqc( + opj_t1_t *t1, + flag_t 
*flagsp, + int *datap, + int poshalf, + int neghalf); +static void INLINE t1_dec_refpass_step_mqc_vsc( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int poshalf, + int neghalf, + int vsc); + /** Encode refinement pass */ @@ -124,11 +156,16 @@ static void t1_enc_refpass( /** Decode refinement pass */ -static void t1_dec_refpass( +static void t1_dec_refpass_raw( opj_t1_t *t1, int bpno, - char type, int cblksty); +static void t1_dec_refpass_mqc( + opj_t1_t *t1, + int bpno); +static void t1_dec_refpass_mqc_vsc( + opj_t1_t *t1, + int bpno); /** Encode clean-up pass */ @@ -145,7 +182,19 @@ static void t1_enc_clnpass_step( /** Decode clean-up pass */ +static void t1_dec_clnpass_step_partial( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf); static void t1_dec_clnpass_step( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf); +static void t1_dec_clnpass_step_vsc( opj_t1_t *t1, flag_t *flagsp, int *datap, @@ -323,29 +372,42 @@ static void t1_enc_sigpass_step( } } -static void t1_dec_sigpass_step( +static INLINE void t1_dec_sigpass_step_raw( opj_t1_t *t1, flag_t *flagsp, int *datap, int orient, int oneplushalf, - char type, int vsc) { int v, flag; opj_raw_t *raw = t1->raw; /* RAW component */ - opj_mqc_t *mqc = t1->mqc; /* MQC component */ flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { - if (type == T1_TYPE_RAW) { if (raw_decode(raw)) { v = raw_decode(raw); /* ESSAI */ *datap = v ? 
-oneplushalf : oneplushalf; t1_updateflags(flagsp, v, t1->flags_stride); } - } else { + *flagsp |= T1_VISIT; + } +} /* VSC and BYPASS by Antonin */ + +static INLINE void t1_dec_sigpass_step_mqc( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf) +{ + int v, flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = *flagsp; + if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { mqc_setcurctx(mqc, t1_getctxno_zc(flag, orient)); if (mqc_decode(mqc)) { mqc_setcurctx(mqc, t1_getctxno_sc(flag)); @@ -353,6 +415,30 @@ static void t1_dec_sigpass_step( *datap = v ? -oneplushalf : oneplushalf; t1_updateflags(flagsp, v, t1->flags_stride); } + *flagsp |= T1_VISIT; + } +} /* VSC and BYPASS by Antonin */ + +static INLINE void t1_dec_sigpass_step_mqc_vsc( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf, + int vsc) +{ + int v, flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); + if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { + mqc_setcurctx(mqc, t1_getctxno_zc(flag, orient)); + if (mqc_decode(mqc)) { + mqc_setcurctx(mqc, t1_getctxno_sc(flag)); + v = mqc_decode(mqc) ^ t1_getspb(flag); + *datap = v ? -oneplushalf : oneplushalf; + t1_updateflags(flagsp, v, t1->flags_stride); } *flagsp |= T1_VISIT; } @@ -388,11 +474,10 @@ static void t1_enc_sigpass( } } -static void t1_dec_sigpass( +static void t1_dec_sigpass_raw( opj_t1_t *t1, int bpno, int orient, - char type, int cblksty) { int i, j, k, one, half, oneplushalf, vsc; @@ -403,13 +488,79 @@ static void t1_dec_sigpass( for (i = 0; i < t1->w; ++i) { for (j = k; j < k + 4 && j < t1->h; ++j) { vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (j == k + 3 || j == t1->h - 1)) ? 
1 : 0; - t1_dec_sigpass_step( + t1_dec_sigpass_step_raw( + t1, + &t1->flags[((j+1) * t1->flags_stride) + i + 1], + &t1->data[(j * t1->w) + i], + orient, + oneplushalf, + vsc); + } + } + } +} /* VSC and BYPASS by Antonin */ + +static void t1_dec_sigpass_mqc( + opj_t1_t *t1, + int bpno, + int orient) +{ + int i, j, k, one, half, oneplushalf; + int *data1 = t1->data; + flag_t *flags1 = &t1->flags[1]; + one = 1 << bpno; + half = one >> 1; + oneplushalf = one | half; + for (k = 0; k < (t1->h & ~3); k += 4) { + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + flags2 += t1->flags_stride; + t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + } + data1 += t1->w << 2; + flags1 += t1->flags_stride << 2; + } + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + for (j = k; j < t1->h; ++j) { + flags2 += t1->flags_stride; + t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + } + } +} /* VSC and BYPASS by Antonin */ + +static void t1_dec_sigpass_mqc_vsc( + opj_t1_t *t1, + int bpno, + int orient) +{ + int i, j, k, one, half, oneplushalf, vsc; + one = 1 << bpno; + half = one >> 1; + oneplushalf = one | half; + for (k = 0; k < t1->h; k += 4) { + for (i = 0; i < t1->w; ++i) { + for (j = k; j < k + 4 && j < t1->h; ++j) { + vsc = (j == k + 3 || j == t1->h - 1) ? 
1 : 0; + t1_dec_sigpass_step_mqc_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->w) + i], orient, oneplushalf, - type, vsc); } } @@ -444,28 +595,64 @@ static void t1_enc_refpass_step( } } -static void t1_dec_refpass_step( +static INLINE void t1_dec_refpass_step_raw( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int poshalf, + int neghalf, + int vsc) +{ + int v, t, flag; + + opj_raw_t *raw = t1->raw; /* RAW component */ + + flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); + if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { + v = raw_decode(raw); + t = v ? poshalf : neghalf; + *datap += *datap < 0 ? -t : t; + *flagsp |= T1_REFINE; + } +} /* VSC and BYPASS by Antonin */ + +static INLINE void t1_dec_refpass_step_mqc( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int poshalf, + int neghalf) +{ + int v, t, flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = *flagsp; + if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { + mqc_setcurctx(mqc, t1_getctxno_mag(flag)); /* ESSAI */ + v = mqc_decode(mqc); + t = v ? poshalf : neghalf; + *datap += *datap < 0 ? -t : t; + *flagsp |= T1_REFINE; + } +} /* VSC and BYPASS by Antonin */ + +static INLINE void t1_dec_refpass_step_mqc_vsc( opj_t1_t *t1, flag_t *flagsp, int *datap, int poshalf, int neghalf, - char type, int vsc) { int v, t, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - opj_raw_t *raw = t1->raw; /* RAW component */ flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { mqc_setcurctx(mqc, t1_getctxno_mag(flag)); /* ESSAI */ - if (type == T1_TYPE_RAW) { - v = raw_decode(raw); - } else { - v = mqc_decode(mqc); - } + v = mqc_decode(mqc); t = v ? poshalf : neghalf; *datap += *datap < 0 ? 
-t : t; *flagsp |= T1_REFINE; @@ -500,10 +687,9 @@ static void t1_enc_refpass( } } -static void t1_dec_refpass( +static void t1_dec_refpass_raw( opj_t1_t *t1, int bpno, - char type, int cblksty) { int i, j, k, one, poshalf, neghalf; @@ -515,13 +701,78 @@ static void t1_dec_refpass( for (i = 0; i < t1->w; ++i) { for (j = k; j < k + 4 && j < t1->h; ++j) { vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (j == k + 3 || j == t1->h - 1)) ? 1 : 0; - t1_dec_refpass_step( + t1_dec_refpass_step_raw( + t1, + &t1->flags[((j+1) * t1->flags_stride) + i + 1], + &t1->data[(j * t1->w) + i], + poshalf, + neghalf, + vsc); + } + } + } +} /* VSC and BYPASS by Antonin */ + +static void t1_dec_refpass_mqc( + opj_t1_t *t1, + int bpno) +{ + int i, j, k, one, poshalf, neghalf; + int *data1 = t1->data; + flag_t *flags1 = &t1->flags[1]; + one = 1 << bpno; + poshalf = one >> 1; + neghalf = bpno > 0 ? -poshalf : -1; + for (k = 0; k < (t1->h & ~3); k += 4) { + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + flags2 += t1->flags_stride; + t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); + data2 += t1->w; + } + data1 += t1->w << 2; + flags1 += t1->flags_stride << 2; + } + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + for (j = k; j < t1->h; ++j) { + flags2 += t1->flags_stride; + t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); + data2 += t1->w; + } + } +} /* VSC and BYPASS by Antonin */ + +static void t1_dec_refpass_mqc_vsc( + opj_t1_t *t1, + int bpno) +{ + int i, j, k, one, poshalf, neghalf; + int vsc; + one = 1 << bpno; + poshalf = one >> 1; + neghalf = bpno > 0 ? 
-poshalf : -1; + for (k = 0; k < t1->h; k += 4) { + for (i = 0; i < t1->w; ++i) { + for (j = k; j < k + 4 && j < t1->h; ++j) { + vsc = ((j == k + 3 || j == t1->h - 1)) ? 1 : 0; + t1_dec_refpass_step_mqc_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->w) + i], poshalf, neghalf, - type, vsc); } } @@ -563,7 +814,50 @@ LABEL_PARTIAL: *flagsp &= ~T1_VISIT; } +static void t1_dec_clnpass_step_partial( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf) +{ + int v, flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = *flagsp; + mqc_setcurctx(mqc, t1_getctxno_sc(flag)); + v = mqc_decode(mqc) ^ t1_getspb(flag); + *datap = v ? -oneplushalf : oneplushalf; + t1_updateflags(flagsp, v, t1->flags_stride); + *flagsp &= ~T1_VISIT; +} /* VSC and BYPASS by Antonin */ + static void t1_dec_clnpass_step( + opj_t1_t *t1, + flag_t *flagsp, + int *datap, + int orient, + int oneplushalf) +{ + int v, flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = *flagsp; + if (!(flag & (T1_SIG | T1_VISIT))) { + mqc_setcurctx(mqc, t1_getctxno_zc(flag, orient)); + if (mqc_decode(mqc)) { + mqc_setcurctx(mqc, t1_getctxno_sc(flag)); + v = mqc_decode(mqc) ^ t1_getspb(flag); + *datap = v ? 
-oneplushalf : oneplushalf; + t1_updateflags(flagsp, v, t1->flags_stride); + } + } + *flagsp &= ~T1_VISIT; +} /* VSC and BYPASS by Antonin */ + +static void t1_dec_clnpass_step_vsc( opj_t1_t *t1, flag_t *flagsp, int *datap, @@ -591,7 +885,7 @@ LABEL_PARTIAL: } } *flagsp &= ~T1_VISIT; -} /* VSC and BYPASS by Antonin */ +} static void t1_enc_clnpass( opj_t1_t *t1, @@ -671,22 +965,16 @@ static void t1_dec_clnpass( one = 1 << bpno; half = one >> 1; oneplushalf = one | half; + if (cblksty & J2K_CCP_CBLKSTY_VSC) { for (k = 0; k < t1->h; k += 4) { for (i = 0; i < t1->w; ++i) { if (k + 3 < t1->h) { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { agg = !(MACRO_t1_flags(1 + k,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) || MACRO_t1_flags(1 + k + 1,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) || MACRO_t1_flags(1 + k + 2,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) || (MACRO_t1_flags(1 + k + 3,1 + i) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); } else { - agg = !(MACRO_t1_flags(1 + k,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || MACRO_t1_flags(1 + k + 1,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || MACRO_t1_flags(1 + k + 2,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || MACRO_t1_flags(1 + k + 3,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); - } - } else { agg = 0; } if (agg) { @@ -701,8 +989,8 @@ static void t1_dec_clnpass( runlen = 0; } for (j = k + runlen; j < k + 4 && j < t1->h; ++j) { - vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (j == k + 3 || j == t1->h - 1)) ? 1 : 0; - t1_dec_clnpass_step( + vsc = (j == k + 3 || j == t1->h - 1) ? 
1 : 0; + t1_dec_clnpass_step_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->w) + i], @@ -713,6 +1001,65 @@ static void t1_dec_clnpass( } } } + } else { + int *data1 = t1->data; + flag_t *flags1 = &t1->flags[1]; + for (k = 0; k < (t1->h & ~3); k += 4) { + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + agg = !(MACRO_t1_flags(1 + k,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) + || MACRO_t1_flags(1 + k + 1,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) + || MACRO_t1_flags(1 + k + 2,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) + || MACRO_t1_flags(1 + k + 3,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); + if (agg) { + mqc_setcurctx(mqc, T1_CTXNO_AGG); + if (!mqc_decode(mqc)) { + continue; + } + mqc_setcurctx(mqc, T1_CTXNO_UNI); + runlen = mqc_decode(mqc); + runlen = (runlen << 1) | mqc_decode(mqc); + flags2 += runlen * t1->flags_stride; + data2 += runlen * t1->w; + for (j = k + runlen; j < k + 4 && j < t1->h; ++j) { + flags2 += t1->flags_stride; + if (agg && (j == k + runlen)) { + t1_dec_clnpass_step_partial(t1, flags2, data2, orient, oneplushalf); + } else { + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + } + data2 += t1->w; + } + } else { + flags2 += t1->flags_stride; + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + flags2 += t1->flags_stride; + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + } + } + data1 += t1->w << 2; + flags1 += t1->flags_stride << 2; + } + for (i = 0; i < t1->w; ++i) { + int *data2 = data1 + i; + flag_t *flags2 = flags1 + i; + for (j = k; j < t1->h; ++j) { + flags2 += t1->flags_stride; + t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + data2 += t1->w; + } + } + } + if (segsym) 
{ int v = 0; mqc_setcurctx(mqc, T1_CTXNO_UNI); @@ -975,10 +1322,26 @@ static void t1_decode_cblk( for (passno = 0; passno < seg->numpasses; ++passno) { switch (passtype) { case 0: - t1_dec_sigpass(t1, bpno+1, orient, type, cblksty); + if (type == T1_TYPE_RAW) { + t1_dec_sigpass_raw(t1, bpno+1, orient, cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + t1_dec_sigpass_mqc_vsc(t1, bpno+1, orient); + } else { + t1_dec_sigpass_mqc(t1, bpno+1, orient); + } + } break; case 1: - t1_dec_refpass(t1, bpno+1, type, cblksty); + if (type == T1_TYPE_RAW) { + t1_dec_refpass_raw(t1, bpno+1, cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + t1_dec_refpass_mqc_vsc(t1, bpno+1); + } else { + t1_dec_refpass_mqc(t1, bpno+1); + } + } break; case 2: t1_dec_clnpass(t1, bpno+1, orient, cblksty); @@ -1145,7 +1508,6 @@ void t1_decode_cblks( for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; int* restrict datap; - void* restrict tiledp; int cblk_w, cblk_h; int x, y; int i, j; @@ -1186,8 +1548,8 @@ void t1_decode_cblks( } } - tiledp=(void*)&tilec->data[(y * tile_w) + x]; if (tccp->qmfbid == 1) { + int* restrict tiledp = &tilec->data[(y * tile_w) + x]; for (j = 0; j < cblk_h; ++j) { for (i = 0; i < cblk_w; ++i) { int tmp = datap[(j * cblk_w) + i]; @@ -1195,11 +1557,16 @@ void t1_decode_cblks( } } } else { /* if (tccp->qmfbid == 0) */ + float* restrict tiledp = (float*) &tilec->data[(y * tile_w) + x]; for (j = 0; j < cblk_h; ++j) { + float* restrict tiledp2 = tiledp; for (i = 0; i < cblk_w; ++i) { - float tmp = datap[(j * cblk_w) + i] * band->stepsize; - ((float*)tiledp)[(j * tile_w) + i] = tmp; + float tmp = *datap * band->stepsize; + *tiledp2 = tmp; + datap++; + tiledp2++; } + tiledp += tile_w; } } opj_free(cblk->data);