From 426bf8d337715f7b2e867cb2643128e5c2e3b5bb Mon Sep 17 00:00:00 2001 From: Kal Conley Date: Sun, 27 Dec 2015 20:14:47 +0100 Subject: [PATCH 01/22] Move some MQC functions into a header for speed Allow these hot functions to be inlined. This boosts decode performance by ~10%. --- src/lib/openjp2/CMakeLists.txt | 1 + src/lib/openjp2/mqc.c | 117 ------------------------ src/lib/openjp2/mqc.h | 4 +- src/lib/openjp2/mqc_inl.h | 159 +++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 118 deletions(-) create mode 100644 src/lib/openjp2/mqc_inl.h diff --git a/src/lib/openjp2/CMakeLists.txt b/src/lib/openjp2/CMakeLists.txt index 367a7a8d..c02a9948 100644 --- a/src/lib/openjp2/CMakeLists.txt +++ b/src/lib/openjp2/CMakeLists.txt @@ -29,6 +29,7 @@ set(OPENJPEG_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/mct.h ${CMAKE_CURRENT_SOURCE_DIR}/mqc.c ${CMAKE_CURRENT_SOURCE_DIR}/mqc.h + ${CMAKE_CURRENT_SOURCE_DIR}/mqc_inl.h ${CMAKE_CURRENT_SOURCE_DIR}/openjpeg.c ${CMAKE_CURRENT_SOURCE_DIR}/openjpeg.h ${CMAKE_CURRENT_SOURCE_DIR}/opj_clock.c diff --git a/src/lib/openjp2/mqc.c b/src/lib/openjp2/mqc.c index 4e409a7c..7119c3a5 100644 --- a/src/lib/openjp2/mqc.c +++ b/src/lib/openjp2/mqc.c @@ -70,28 +70,6 @@ Fill mqc->c with 1's for flushing @param mqc MQC handle */ static void opj_mqc_setbits(opj_mqc_t *mqc); -/** -FIXME DOC -@param mqc MQC handle -@return -*/ -static INLINE OPJ_INT32 opj_mqc_mpsexchange(opj_mqc_t *const mqc); -/** -FIXME DOC -@param mqc MQC handle -@return -*/ -static INLINE OPJ_INT32 opj_mqc_lpsexchange(opj_mqc_t *const mqc); -/** -Input a byte -@param mqc MQC handle -*/ -static INLINE void opj_mqc_bytein(opj_mqc_t *const mqc); -/** -Renormalize mqc->a and mqc->c while decoding -@param mqc MQC handle -*/ -static INLINE void opj_mqc_renormd(opj_mqc_t *const mqc); /*@}*/ /*@}*/ @@ -284,82 +262,6 @@ static void opj_mqc_setbits(opj_mqc_t *mqc) { } } -static INLINE OPJ_INT32 opj_mqc_mpsexchange(opj_mqc_t *const mqc) { - OPJ_INT32 d; - if (mqc->a < 
(*mqc->curctx)->qeval) { - d = (OPJ_INT32)(1 - (*mqc->curctx)->mps); - *mqc->curctx = (*mqc->curctx)->nlps; - } else { - d = (OPJ_INT32)(*mqc->curctx)->mps; - *mqc->curctx = (*mqc->curctx)->nmps; - } - - return d; -} - -static INLINE OPJ_INT32 opj_mqc_lpsexchange(opj_mqc_t *const mqc) { - OPJ_INT32 d; - if (mqc->a < (*mqc->curctx)->qeval) { - mqc->a = (*mqc->curctx)->qeval; - d = (OPJ_INT32)(*mqc->curctx)->mps; - *mqc->curctx = (*mqc->curctx)->nmps; - } else { - mqc->a = (*mqc->curctx)->qeval; - d = (OPJ_INT32)(1 - (*mqc->curctx)->mps); - *mqc->curctx = (*mqc->curctx)->nlps; - } - - return d; -} - -#ifdef MQC_PERF_OPT -static INLINE void opj_mqc_bytein(opj_mqc_t *const mqc) { - unsigned int i = *((unsigned int *) mqc->bp); - mqc->c += i & 0xffff00; - mqc->ct = i & 0x0f; - mqc->bp += (i >> 2) & 0x04; -} -#else -static void opj_mqc_bytein(opj_mqc_t *const mqc) { - if (mqc->bp != mqc->end) { - OPJ_UINT32 c; - if (mqc->bp + 1 != mqc->end) { - c = *(mqc->bp + 1); - } else { - c = 0xff; - } - if (*mqc->bp == 0xff) { - if (c > 0x8f) { - mqc->c += 0xff00; - mqc->ct = 8; - } else { - mqc->bp++; - mqc->c += c << 9; - mqc->ct = 7; - } - } else { - mqc->bp++; - mqc->c += c << 8; - mqc->ct = 8; - } - } else { - mqc->c += 0xff00; - mqc->ct = 8; - } -} -#endif - -static INLINE void opj_mqc_renormd(opj_mqc_t *const mqc) { - do { - if (mqc->ct == 0) { - opj_mqc_bytein(mqc); - } - mqc->a <<= 1; - mqc->c <<= 1; - mqc->ct--; - } while (mqc->a < 0x8000); -} - /* ========================================================== MQ-Coder interface @@ -585,25 +487,6 @@ OPJ_BOOL opj_mqc_init_dec(opj_mqc_t *mqc, OPJ_BYTE *bp, OPJ_UINT32 len) { return OPJ_TRUE; } -OPJ_INT32 opj_mqc_decode(opj_mqc_t *const mqc) { - OPJ_INT32 d; - mqc->a -= (*mqc->curctx)->qeval; - if ((mqc->c >> 16) < (*mqc->curctx)->qeval) { - d = opj_mqc_lpsexchange(mqc); - opj_mqc_renormd(mqc); - } else { - mqc->c -= (*mqc->curctx)->qeval << 16; - if ((mqc->a & 0x8000) == 0) { - d = opj_mqc_mpsexchange(mqc); - 
opj_mqc_renormd(mqc); - } else { - d = (OPJ_INT32)(*mqc->curctx)->mps; - } - } - - return d; -} - void opj_mqc_resetstates(opj_mqc_t *mqc) { OPJ_UINT32 i; for (i = 0; i < MQC_NUMCTXS; i++) { diff --git a/src/lib/openjp2/mqc.h b/src/lib/openjp2/mqc.h index 69a2d460..574c599b 100644 --- a/src/lib/openjp2/mqc.h +++ b/src/lib/openjp2/mqc.h @@ -82,6 +82,8 @@ typedef struct opj_mqc { #endif } opj_mqc_t; +#include "mqc_inl.h" + /** @name Exported functions */ /*@{*/ /* ----------------------------------------------------------------------- */ @@ -198,7 +200,7 @@ Decode a symbol @param mqc MQC handle @return Returns the decoded symbol (0 or 1) */ -OPJ_INT32 opj_mqc_decode(opj_mqc_t * const mqc); +static INLINE OPJ_INT32 opj_mqc_decode(opj_mqc_t * const mqc); /* ----------------------------------------------------------------------- */ /*@}*/ diff --git a/src/lib/openjp2/mqc_inl.h b/src/lib/openjp2/mqc_inl.h new file mode 100644 index 00000000..882b59f4 --- /dev/null +++ b/src/lib/openjp2/mqc_inl.h @@ -0,0 +1,159 @@ +/* + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium + * Copyright (c) 2002-2014, Professor Benoit Macq + * Copyright (c) 2001-2003, David Janssens + * Copyright (c) 2002-2003, Yannick Verschueren + * Copyright (c) 2003-2007, Francois-Olivier Devaux + * Copyright (c) 2003-2014, Antonin Descampe + * Copyright (c) 2005, Herve Drolon, FreeImage Team + * Copyright (c) 2008, Jerome Fimes, Communications & Systemes + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __MQC_INL_H +#define __MQC_INL_H +/** +FIXME DOC +@param mqc MQC handle +@return +*/ +static INLINE OPJ_INT32 opj_mqc_mpsexchange(opj_mqc_t *const mqc) { + OPJ_INT32 d; + if (mqc->a < (*mqc->curctx)->qeval) { + d = (OPJ_INT32)(1 - (*mqc->curctx)->mps); + *mqc->curctx = (*mqc->curctx)->nlps; + } else { + d = (OPJ_INT32)(*mqc->curctx)->mps; + *mqc->curctx = (*mqc->curctx)->nmps; + } + + return d; +} + +/** +FIXME DOC +@param mqc MQC handle +@return +*/ +static INLINE OPJ_INT32 opj_mqc_lpsexchange(opj_mqc_t *const mqc) { + OPJ_INT32 d; + if (mqc->a < (*mqc->curctx)->qeval) { + mqc->a = (*mqc->curctx)->qeval; + d = (OPJ_INT32)(*mqc->curctx)->mps; + *mqc->curctx = (*mqc->curctx)->nmps; + } else { + mqc->a = (*mqc->curctx)->qeval; + d = (OPJ_INT32)(1 - (*mqc->curctx)->mps); + *mqc->curctx = (*mqc->curctx)->nlps; + } + + return d; +} + +/** +Input a byte +@param mqc MQC handle +*/ +#ifdef MQC_PERF_OPT +static INLINE void opj_mqc_bytein(opj_mqc_t *const mqc) { + unsigned int i = *((unsigned int *) mqc->bp); + mqc->c += i & 0xffff00; + mqc->ct = i & 0x0f; + mqc->bp += (i >> 2) & 0x04; +} +#else +static INLINE void opj_mqc_bytein(opj_mqc_t *const mqc) { + if (mqc->bp != mqc->end) { + OPJ_UINT32 c; + if (mqc->bp + 1 != mqc->end) { + c = *(mqc->bp + 1); + } else { + c = 0xff; + } + if (*mqc->bp == 0xff) { + if (c > 0x8f) { + mqc->c += 0xff00; + mqc->ct = 8; + } else { + mqc->bp++; + mqc->c += c << 9; + mqc->ct = 7; + } + } else { + mqc->bp++; + mqc->c += c << 8; + mqc->ct = 8; + } + } else { + mqc->c += 0xff00; + mqc->ct = 8; + } +} +#endif + +/** +Renormalize mqc->a and mqc->c while decoding +@param mqc MQC handle +*/ +static INLINE void opj_mqc_renormd(opj_mqc_t *const mqc) { + do { + if (mqc->ct == 0) { + opj_mqc_bytein(mqc); + } + mqc->a <<= 1; + mqc->c <<= 1; + mqc->ct--; + } while (mqc->a < 0x8000); +} + +/** +Decode a symbol +@param mqc MQC handle +@return Returns the decoded symbol (0 or 1) +*/ +static INLINE OPJ_INT32 opj_mqc_decode(opj_mqc_t *const 
mqc) { + OPJ_INT32 d; + mqc->a -= (*mqc->curctx)->qeval; + if ((mqc->c >> 16) < (*mqc->curctx)->qeval) { + d = opj_mqc_lpsexchange(mqc); + opj_mqc_renormd(mqc); + } else { + mqc->c -= (*mqc->curctx)->qeval << 16; + if ((mqc->a & 0x8000) == 0) { + d = opj_mqc_mpsexchange(mqc); + opj_mqc_renormd(mqc); + } else { + d = (OPJ_INT32)(*mqc->curctx)->mps; + } + } + + return d; +} + +#endif /* __MQC_INL_H */ From c539808d097945866c0f7120ccdea28921a011a2 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 21 May 2016 15:41:36 +0200 Subject: [PATCH 02/22] opj_t1_updateflags(): tiny optimization We can avoid using a look-up table with some shift arithmetic. --- src/lib/openjp2/t1.c | 19 +++++++++---------- src/lib/openjp2/t1.h | 2 ++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 1bf7205e..37fc4fc1 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -343,23 +343,22 @@ static void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stri opj_flag_t *np = flagsp - stride; opj_flag_t *sp = flagsp + stride; - static const opj_flag_t mod[] = { - T1_SIG_S, T1_SIG_S|T1_SGN_S, - T1_SIG_E, T1_SIG_E|T1_SGN_E, - T1_SIG_W, T1_SIG_W|T1_SGN_W, - T1_SIG_N, T1_SIG_N|T1_SGN_N - }; + /* We strongly rely on T1_SGN_N (0x0100) == T1_SIG_N (0x0010) << 4 */ + /* and T1_SIG_E == T1_SIG_N << 1, T1_SIG_S == T1_SIG_N << 2 and T1_SIG_W == T1_SIG_N << 3 */ + /* and T1_SGN_E == T1_SGN_N << 1, T1_SGN_S == T1_SGN_N << 2 and T1_SGN_W == T1_SGN_N << 3 */ + + opj_flag_t flag_N = T1_SIG_N | (T1_SIG_N << (4 * s)); np[-1] |= T1_SIG_SE; - np[0] |= mod[s]; + np[0] |= flag_N << 2; np[1] |= T1_SIG_SW; - flagsp[-1] |= mod[s+2]; + flagsp[-1] |= flag_N << 1; flagsp[0] |= T1_SIG; - flagsp[1] |= mod[s+4]; + flagsp[1] |= flag_N << 3; sp[-1] |= T1_SIG_NE; - sp[0] |= mod[s+6]; + sp[0] |= flag_N; sp[1] |= T1_SIG_NW; } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index 3bc0ad9e..e9d3db57 100644 ---
a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -50,6 +50,8 @@ in T1.C are used by some function in TCD.C. /* ----------------------------------------------------------------------- */ #define T1_NMSEDEC_BITS 7 +/* CAUTION: the value of those constants must not be changed, otherwise the */ +/* optimization of opj_t1_updateflags() will break! */ #define T1_SIG_NE 0x0001 /**< Context orientation : North-East direction */ #define T1_SIG_SE 0x0002 /**< Context orientation : South-East direction */ #define T1_SIG_SW 0x0004 /**< Context orientation : South-West direction */ From d8fef96f23ea8b12226d7326118f2ffd91da28ac Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 21 May 2016 15:52:02 +0200 Subject: [PATCH 03/22] Improve code generation in opj_t1_dec_clnpass() Add a opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit() method that does the job of opj_t1_dec_clnpass_step_only() assuming the conditions are met. And use it in opj_t1_dec_clnpass(). The compiler generates more efficient code. --- src/lib/openjp2/t1.c | 47 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 37fc4fc1..124d68ea 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -906,6 +906,32 @@ static void opj_t1_dec_clnpass_step( *flagsp &= ~T1_VISIT; } /* VSC and BYPASS by Antonin */ +static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( + opj_t1_t *t1, + opj_flag_t *flagsp, + OPJ_INT32 *datap, + OPJ_INT32 orient, + OPJ_INT32 oneplushalf) +{ + OPJ_INT32 v; + OPJ_INT32 flag; + + opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + flag = *flagsp; + /*if (!(flag & (T1_SIG | T1_VISIT)))*/ + { + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + if (opj_mqc_decode(mqc)) { + opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); + v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); + *datap = v ? 
-oneplushalf : oneplushalf; + opj_t1_updateflags(flagsp, v, t1->flags_stride); + } + } + /*flagsp &= ~T1_VISIT;*/ +} + static void opj_t1_dec_clnpass_step_vsc( opj_t1_t *t1, opj_flag_t *flagsp, @@ -1084,17 +1110,30 @@ static void opj_t1_dec_clnpass( data2 += t1->w; } } else { + opj_flag_t flag; flags2 += t1->flags_stride; - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + flag = *flags2; + if (!(flag & (T1_SIG | T1_VISIT))) + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); + *flags2 &= ~T1_VISIT; data2 += t1->w; flags2 += t1->flags_stride; - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + flag = *flags2; + if (!(flag & (T1_SIG | T1_VISIT))) + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); + *flags2 &= ~T1_VISIT; data2 += t1->w; flags2 += t1->flags_stride; - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + flag = *flags2; + if (!(flag & (T1_SIG | T1_VISIT))) + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); + *flags2 &= ~T1_VISIT; data2 += t1->w; flags2 += t1->flags_stride; - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); + flag = *flags2; + if (!(flag & (T1_SIG | T1_VISIT))) + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); + *flags2 &= ~T1_VISIT; data2 += t1->w; } } From 23a01dfdef1a266754af268b07d912efbe04a759 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 21 May 2016 16:27:35 +0200 Subject: [PATCH 04/22] Specialize decoding passes for 64x64 code blocks --- src/lib/openjp2/t1.c | 586 ++++++++++++++++++++++++------------------- 1 file changed, 334 insertions(+), 252 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 124d68ea..07e358a5 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -118,10 +118,6 @@ static void opj_t1_dec_sigpass_raw( OPJ_INT32 bpno, OPJ_INT32 orient, 
OPJ_INT32 cblksty); -static void opj_t1_dec_sigpass_mqc( - opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 orient); static void opj_t1_dec_sigpass_mqc_vsc( opj_t1_t *t1, OPJ_INT32 bpno, @@ -158,9 +154,6 @@ static void opj_t1_dec_refpass_raw( opj_t1_t *t1, OPJ_INT32 bpno, OPJ_INT32 cblksty); -static void opj_t1_dec_refpass_mqc( - opj_t1_t *t1, - OPJ_INT32 bpno); static void opj_t1_dec_refpass_mqc_vsc( opj_t1_t *t1, OPJ_INT32 bpno); @@ -536,49 +529,63 @@ static void opj_t1_dec_sigpass_raw( } } /* VSC and BYPASS by Antonin */ -static void opj_t1_dec_sigpass_mqc( +#define opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, w, h, flags_stride) \ +{ \ + OPJ_INT32 one, half, oneplushalf; \ + OPJ_UINT32 i, j, k; \ + OPJ_INT32 *data1 = t1->data; \ + opj_flag_t *flags1 = &t1->flags[1]; \ + one = 1 << bpno; \ + half = one >> 1; \ + oneplushalf = one | half; \ + for (k = 0; k < (h & ~3u); k += 4) { \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + flags2 += flags_stride; \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + } \ + data1 += w << 2; \ + flags1 += flags_stride << 2; \ + } \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + for (j = k; j < h; ++j) { \ + flags2 += flags_stride; \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + } \ + } \ +} + +static void opj_t1_dec_sigpass_mqc_64x64( opj_t1_t *t1, OPJ_INT32 bpno, OPJ_INT32 orient) { - OPJ_INT32 one, half, oneplushalf; - OPJ_UINT32 i, j, k; - OPJ_INT32 *data1 = t1->data; - opj_flag_t *flags1 
= &t1->flags[1]; - one = 1 << bpno; - half = one >> 1; - oneplushalf = one | half; - for (k = 0; k < (t1->h & ~3u); k += 4) { - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - flags2 += t1->flags_stride; - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - } - data1 += t1->w << 2; - flags1 += t1->flags_stride << 2; - } - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - for (j = k; j < t1->h; ++j) { - flags2 += t1->flags_stride; - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - } - } -} /* VSC and BYPASS by Antonin */ + opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, 64, 64, 66); +} +static void opj_t1_dec_sigpass_mqc_generic( + opj_t1_t *t1, + OPJ_INT32 bpno, + OPJ_INT32 orient) +{ + opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, t1->w, t1->h, t1->flags_stride); +} + +/* VSC and BYPASS by Antonin */ static void opj_t1_dec_sigpass_mqc_vsc( opj_t1_t *t1, OPJ_INT32 bpno, @@ -757,48 +764,61 @@ static void opj_t1_dec_refpass_raw( } } /* VSC and BYPASS by Antonin */ -static void opj_t1_dec_refpass_mqc( +#define opj_t1_dec_refpass_mqc_internal(t1, bpno, w, h, flags_stride) \ +{ \ + OPJ_INT32 one, poshalf, neghalf; \ + OPJ_UINT32 i, j, k; \ + OPJ_INT32 *data1 = t1->data; \ + opj_flag_t *flags1 = &t1->flags[1]; \ + one = 1 << bpno; \ + poshalf = one >> 1; \ + neghalf = bpno > 0 ? 
-poshalf : -1; \ + for (k = 0; k < (h & ~3u); k += 4) { \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + flags2 += flags_stride; \ + opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + data2 += w; \ + flags2 += flags_stride; \ + opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + data2 += w; \ + } \ + data1 += w << 2; \ + flags1 += flags_stride << 2; \ + } \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + for (j = k; j < h; ++j) { \ + flags2 += flags_stride; \ + opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + data2 += w; \ + } \ + } \ +} + +static void opj_t1_dec_refpass_mqc_64x64( opj_t1_t *t1, OPJ_INT32 bpno) { - OPJ_INT32 one, poshalf, neghalf; - OPJ_UINT32 i, j, k; - OPJ_INT32 *data1 = t1->data; - opj_flag_t *flags1 = &t1->flags[1]; - one = 1 << bpno; - poshalf = one >> 1; - neghalf = bpno > 0 ? 
-poshalf : -1; - for (k = 0; k < (t1->h & ~3u); k += 4) { - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - flags2 += t1->flags_stride; - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); - data2 += t1->w; - flags2 += t1->flags_stride; - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); - data2 += t1->w; - } - data1 += t1->w << 2; - flags1 += t1->flags_stride << 2; - } - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - for (j = k; j < t1->h; ++j) { - flags2 += t1->flags_stride; - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); - data2 += t1->w; - } - } -} /* VSC and BYPASS by Antonin */ + opj_t1_dec_refpass_mqc_internal(t1, bpno, 64, 64, 66); +} +static void opj_t1_dec_refpass_mqc_generic( + opj_t1_t *t1, + OPJ_INT32 bpno) +{ + opj_t1_dec_refpass_mqc_internal(t1, bpno, t1->w, t1->h, t1->flags_stride); +} + +/* VSC and BYPASS by Antonin */ static void opj_t1_dec_refpass_mqc_vsc( opj_t1_t *t1, OPJ_INT32 bpno) @@ -1028,143 +1048,159 @@ static void opj_t1_enc_clnpass( } } -static void opj_t1_dec_clnpass( +#define MACRO_t1_flags_internal(x,y,flags_stride) t1->flags[((x)*(flags_stride))+(y)] + +#define opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, w, h, flags_stride) \ +{ \ + OPJ_INT32 one, half, oneplushalf, agg, runlen, vsc; \ + OPJ_UINT32 i, j, k; \ + OPJ_INT32 segsym = cblksty & J2K_CCP_CBLKSTY_SEGSYM; \ + \ + opj_mqc_t *mqc = t1->mqc; /* MQC component */ \ + \ + one = 1 << bpno; \ + half = one >> 1; \ + oneplushalf = one | half; \ + if (cblksty & J2K_CCP_CBLKSTY_VSC) { \ + for (k = 0; k < h; k += 4) { \ + for (i = 0; i < w; ++i) { \ + if (k + 3 < h) { \ + agg = 
!(MACRO_t1_flags_internal(1 + k,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ + || MACRO_t1_flags_internal(1 + k + 1,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ + || MACRO_t1_flags_internal(1 + k + 2,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ + || (MACRO_t1_flags_internal(1 + k + 3,1 + i,flags_stride) \ + & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); \ + } else { \ + agg = 0; \ + } \ + if (agg) { \ + opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); \ + if (!opj_mqc_decode(mqc)) { \ + continue; \ + } \ + opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); \ + runlen = opj_mqc_decode(mqc); \ + runlen = (runlen << 1) | opj_mqc_decode(mqc); \ + } else { \ + runlen = 0; \ + } \ + for (j = k + (OPJ_UINT32)runlen; j < k + 4 && j < h; ++j) { \ + vsc = (j == k + 3 || j == h - 1) ? 1 : 0; \ + opj_t1_dec_clnpass_step_vsc( \ + t1, \ + &t1->flags[((j+1) * flags_stride) + i + 1], \ + &t1->data[(j * w) + i], \ + orient, \ + oneplushalf, \ + agg && (j == k + (OPJ_UINT32)runlen), \ + vsc); \ + } \ + } \ + } \ + } else { \ + OPJ_INT32 *data1 = t1->data; \ + opj_flag_t *flags1 = &t1->flags[1]; \ + for (k = 0; k < (h & ~3u); k += 4) { \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + agg = !((MACRO_t1_flags_internal(1 + k, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 1, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 2, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 3, 1 + i,flags_stride)) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); \ + if (agg) { \ + opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); \ + if (!opj_mqc_decode(mqc)) { \ + continue; \ + } \ + opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); \ + runlen = opj_mqc_decode(mqc); \ + runlen = (runlen << 1) | opj_mqc_decode(mqc); \ + flags2 += (OPJ_UINT32)runlen * flags_stride; \ + data2 += (OPJ_UINT32)runlen * w; \ + for (j = (OPJ_UINT32)runlen; j < 4 && j < h; ++j) { \ + flags2 += flags_stride; \ + if (agg && (j 
== (OPJ_UINT32)runlen)) { \ + opj_t1_dec_clnpass_step_partial(t1, flags2, data2, orient, oneplushalf); \ + } else { \ + opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); \ + } \ + data2 += w; \ + } \ + } else { \ + opj_flag_t flag; \ + flags2 += flags_stride; \ + flag = *flags2; \ + if (!(flag & (T1_SIG | T1_VISIT))) \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + *flags2 &= ~T1_VISIT; \ + data2 += w; \ + flags2 += flags_stride; \ + flag = *flags2; \ + if (!(flag & (T1_SIG | T1_VISIT))) \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + *flags2 &= ~T1_VISIT; \ + data2 += w; \ + flags2 += flags_stride; \ + flag = *flags2; \ + if (!(flag & (T1_SIG | T1_VISIT))) \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + *flags2 &= ~T1_VISIT; \ + data2 += w; \ + flags2 += flags_stride; \ + flag = *flags2; \ + if (!(flag & (T1_SIG | T1_VISIT))) \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + *flags2 &= ~T1_VISIT; \ + data2 += w; \ + } \ + } \ + data1 += w << 2; \ + flags1 += flags_stride << 2; \ + } \ + for (i = 0; i < w; ++i) { \ + OPJ_INT32 *data2 = data1 + i; \ + opj_flag_t *flags2 = flags1 + i; \ + for (j = k; j < h; ++j) { \ + flags2 += flags_stride; \ + opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); \ + data2 += w; \ + } \ + } \ + } \ + \ + if (segsym) { \ + OPJ_INT32 v = 0; \ + opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); \ + v = opj_mqc_decode(mqc); \ + v = (v << 1) | opj_mqc_decode(mqc); \ + v = (v << 1) | opj_mqc_decode(mqc); \ + v = (v << 1) | opj_mqc_decode(mqc); \ + /* \ + if (v!=0xa) { \ + opj_event_msg(t1->cinfo, EVT_WARNING, "Bad segmentation symbol %x\n", v); \ + } \ + */ \ + } \ +} /* VSC and BYPASS by Antonin */ + +static void opj_t1_dec_clnpass_64x64( opj_t1_t *t1, OPJ_INT32 bpno, OPJ_INT32 orient, OPJ_INT32 cblksty) { - OPJ_INT32 
one, half, oneplushalf, agg, runlen, vsc; - OPJ_UINT32 i, j, k; - OPJ_INT32 segsym = cblksty & J2K_CCP_CBLKSTY_SEGSYM; - - opj_mqc_t *mqc = t1->mqc; /* MQC component */ - - one = 1 << bpno; - half = one >> 1; - oneplushalf = one | half; - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - for (k = 0; k < t1->h; k += 4) { - for (i = 0; i < t1->w; ++i) { - if (k + 3 < t1->h) { - agg = !(MACRO_t1_flags(1 + k,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || MACRO_t1_flags(1 + k + 1,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || MACRO_t1_flags(1 + k + 2,1 + i) & (T1_SIG | T1_VISIT | T1_SIG_OTH) - || (MACRO_t1_flags(1 + k + 3,1 + i) - & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); - } else { - agg = 0; - } - if (agg) { - opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); - if (!opj_mqc_decode(mqc)) { - continue; - } - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - runlen = opj_mqc_decode(mqc); - runlen = (runlen << 1) | opj_mqc_decode(mqc); - } else { - runlen = 0; - } - for (j = k + (OPJ_UINT32)runlen; j < k + 4 && j < t1->h; ++j) { - vsc = (j == k + 3 || j == t1->h - 1) ? 
1 : 0; - opj_t1_dec_clnpass_step_vsc( - t1, - &t1->flags[((j+1) * t1->flags_stride) + i + 1], - &t1->data[(j * t1->w) + i], - orient, - oneplushalf, - agg && (j == k + (OPJ_UINT32)runlen), - vsc); - } - } - } - } else { - OPJ_INT32 *data1 = t1->data; - opj_flag_t *flags1 = &t1->flags[1]; - for (k = 0; k < (t1->h & ~3u); k += 4) { - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - agg = !((MACRO_t1_flags(1 + k, 1 + i) | - MACRO_t1_flags(1 + k + 1, 1 + i) | - MACRO_t1_flags(1 + k + 2, 1 + i) | - MACRO_t1_flags(1 + k + 3, 1 + i)) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); - if (agg) { - opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); - if (!opj_mqc_decode(mqc)) { - continue; - } - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - runlen = opj_mqc_decode(mqc); - runlen = (runlen << 1) | opj_mqc_decode(mqc); - flags2 += (OPJ_UINT32)runlen * t1->flags_stride; - data2 += (OPJ_UINT32)runlen * t1->w; - for (j = (OPJ_UINT32)runlen; j < 4 && j < t1->h; ++j) { - flags2 += t1->flags_stride; - if (agg && (j == (OPJ_UINT32)runlen)) { - opj_t1_dec_clnpass_step_partial(t1, flags2, data2, orient, oneplushalf); - } else { - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); - } - data2 += t1->w; - } - } else { - opj_flag_t flag; - flags2 += t1->flags_stride; - flag = *flags2; - if (!(flag & (T1_SIG | T1_VISIT))) - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); - *flags2 &= ~T1_VISIT; - data2 += t1->w; - flags2 += t1->flags_stride; - flag = *flags2; - if (!(flag & (T1_SIG | T1_VISIT))) - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); - *flags2 &= ~T1_VISIT; - data2 += t1->w; - flags2 += t1->flags_stride; - flag = *flags2; - if (!(flag & (T1_SIG | T1_VISIT))) - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); - *flags2 &= ~T1_VISIT; - data2 += t1->w; - flags2 += t1->flags_stride; - flag = *flags2; - if (!(flag & 
(T1_SIG | T1_VISIT))) - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); - *flags2 &= ~T1_VISIT; - data2 += t1->w; - } - } - data1 += t1->w << 2; - flags1 += t1->flags_stride << 2; - } - for (i = 0; i < t1->w; ++i) { - OPJ_INT32 *data2 = data1 + i; - opj_flag_t *flags2 = flags1 + i; - for (j = k; j < t1->h; ++j) { - flags2 += t1->flags_stride; - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); - data2 += t1->w; - } - } - } + opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, 64, 64, 66); +} - if (segsym) { - OPJ_INT32 v = 0; - opj_mqc_setcurctx(mqc, T1_CTXNO_UNI); - v = opj_mqc_decode(mqc); - v = (v << 1) | opj_mqc_decode(mqc); - v = (v << 1) | opj_mqc_decode(mqc); - v = (v << 1) | opj_mqc_decode(mqc); - /* - if (v!=0xa) { - opj_event_msg(t1->cinfo, EVT_WARNING, "Bad segmentation symbol %x\n", v); - } - */ - } -} /* VSC and BYPASS by Antonin */ +static void opj_t1_dec_clnpass_generic( + opj_t1_t *t1, + OPJ_INT32 bpno, + OPJ_INT32 orient, + OPJ_INT32 cblksty) +{ + opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, t1->w, t1->h, t1->flags_stride); +} /** mod fixed_quality */ @@ -1446,45 +1482,91 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, } } - for (passno = 0; (passno < seg->real_num_passes) && (bpno_plus_one >= 1); ++passno) { - switch (passtype) { - case 0: - if (type == T1_TYPE_RAW) { - opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); - } else { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one, (OPJ_INT32)orient); - } else { - opj_t1_dec_sigpass_mqc(t1, bpno_plus_one, (OPJ_INT32)orient); - } - } - break; - case 1: - if (type == T1_TYPE_RAW) { - opj_t1_dec_refpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); - } else { - if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_refpass_mqc_vsc(t1, bpno_plus_one); - } else { - opj_t1_dec_refpass_mqc(t1, bpno_plus_one); - } - } - break; - case 2: - opj_t1_dec_clnpass(t1, 
bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); - break; - } + if( t1->w == 64 && t1->h == 64 ) + { + for (passno = 0; (passno < seg->real_num_passes) && (bpno_plus_one >= 1); ++passno) { + switch (passtype) { + case 0: + if (type == T1_TYPE_RAW) { + opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one, (OPJ_INT32)orient); + } else { + opj_t1_dec_sigpass_mqc_64x64(t1, bpno_plus_one, (OPJ_INT32)orient); + } + } + break; + case 1: + if (type == T1_TYPE_RAW) { + opj_t1_dec_refpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + opj_t1_dec_refpass_mqc_vsc(t1, bpno_plus_one); + } else { + opj_t1_dec_refpass_mqc_64x64(t1, bpno_plus_one); + } + } + break; + case 2: + opj_t1_dec_clnpass_64x64(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + break; + } - if ((cblksty & J2K_CCP_CBLKSTY_RESET) && type == T1_TYPE_MQ) { - opj_mqc_resetstates(mqc); - opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); - opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); - opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); - } - if (++passtype == 3) { - passtype = 0; - bpno_plus_one--; - } + if ((cblksty & J2K_CCP_CBLKSTY_RESET) && type == T1_TYPE_MQ) { + opj_mqc_resetstates(mqc); + opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); + opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); + opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); + } + if (++passtype == 3) { + passtype = 0; + bpno_plus_one--; + } + } + } + else + { + for (passno = 0; (passno < seg->real_num_passes) && (bpno_plus_one >= 1); ++passno) { + switch (passtype) { + case 0: + if (type == T1_TYPE_RAW) { + opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one, (OPJ_INT32)orient); + } else { + opj_t1_dec_sigpass_mqc_generic(t1, bpno_plus_one, (OPJ_INT32)orient); + } + } 
+ break; + case 1: + if (type == T1_TYPE_RAW) { + opj_t1_dec_refpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); + } else { + if (cblksty & J2K_CCP_CBLKSTY_VSC) { + opj_t1_dec_refpass_mqc_vsc(t1, bpno_plus_one); + } else { + opj_t1_dec_refpass_mqc_generic(t1, bpno_plus_one); + } + } + break; + case 2: + opj_t1_dec_clnpass_generic(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + break; + } + + if ((cblksty & J2K_CCP_CBLKSTY_RESET) && type == T1_TYPE_MQ) { + opj_mqc_resetstates(mqc); + opj_mqc_setstate(mqc, T1_CTXNO_UNI, 0, 46); + opj_mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3); + opj_mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4); + } + if (++passtype == 3) { + passtype = 0; + bpno_plus_one--; + } + } } } return OPJ_TRUE; From ba1edf6cd41415594729bc90ad3b0008af48251e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 22 May 2016 00:50:34 +0200 Subject: [PATCH 05/22] Reduce number of occurrences of orient function argument This is essentially used to shift inside the lut_ctxno_zc, which we can precompute at the beginning of opj_t1_decode_cblk() / opj_t1_encode_cblk() --- src/lib/openjp2/mqc.h | 1 + src/lib/openjp2/t1.c | 127 +++++++++++++++--------------------------- 2 files changed, 47 insertions(+), 81 deletions(-) diff --git a/src/lib/openjp2/mqc.h b/src/lib/openjp2/mqc.h index 574c599b..491ee50e 100644 --- a/src/lib/openjp2/mqc.h +++ b/src/lib/openjp2/mqc.h @@ -77,6 +77,7 @@ typedef struct opj_mqc { OPJ_BYTE *end; opj_mqc_state_t *ctxs[MQC_NUMCTXS]; opj_mqc_state_t **curctx; + const OPJ_BYTE *lut_ctxno_zc_orient; /* lut_ctxno_zc shifted by 256 * bandno */ #ifdef MQC_PERF_OPT unsigned char *buffer; #endif diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 07e358a5..30919585 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -45,7 +45,7 @@ /** @name Local static functions */ /*@{*/ -static INLINE OPJ_BYTE opj_t1_getctxno_zc(OPJ_UINT32 f, OPJ_UINT32 orient); +static INLINE OPJ_BYTE opj_t1_getctxno_zc(opj_mqc_t *mqc, OPJ_UINT32 f); static 
OPJ_BYTE opj_t1_getctxno_sc(OPJ_UINT32 f); static INLINE OPJ_UINT32 opj_t1_getctxno_mag(OPJ_UINT32 f); static OPJ_BYTE opj_t1_getspb(OPJ_UINT32 f); @@ -58,7 +58,6 @@ Encode significant pass static void opj_t1_enc_sigpass_step(opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_UINT32 orient, OPJ_INT32 bpno, OPJ_INT32 one, OPJ_INT32 *nmsedec, @@ -82,20 +81,17 @@ static INLINE void opj_t1_dec_sigpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 vsc); static INLINE void opj_t1_dec_sigpass_step_mqc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf); static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 vsc); @@ -105,7 +101,6 @@ Encode significant pass */ static void opj_t1_enc_sigpass( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_UINT32 orient, OPJ_INT32 *nmsedec, OPJ_BYTE type, OPJ_UINT32 cblksty); @@ -116,12 +111,10 @@ Decode significant pass static void opj_t1_dec_sigpass_raw( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_INT32 orient, OPJ_INT32 cblksty); static void opj_t1_dec_sigpass_mqc_vsc( opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 orient); + OPJ_INT32 bpno); @@ -202,7 +195,6 @@ static void opj_t1_enc_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_UINT32 orient, OPJ_INT32 bpno, OPJ_INT32 one, OPJ_INT32 *nmsedec, @@ -215,19 +207,16 @@ static void opj_t1_dec_clnpass_step_partial( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf); static void opj_t1_dec_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf); static void opj_t1_dec_clnpass_step_vsc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 partial, OPJ_INT32 vsc); @@ -237,7 +226,6 @@ Encode clean-up pass static void 
opj_t1_enc_clnpass( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_UINT32 orient, OPJ_INT32 *nmsedec, OPJ_UINT32 cblksty); /** @@ -246,7 +234,6 @@ Decode clean-up pass static void opj_t1_dec_clnpass( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_INT32 orient, OPJ_INT32 cblksty); static OPJ_FLOAT64 opj_t1_getwmsedec( @@ -298,8 +285,8 @@ static OPJ_BOOL opj_t1_allocate_buffers( opj_t1_t *t1, /* ----------------------------------------------------------------------- */ -static OPJ_BYTE opj_t1_getctxno_zc(OPJ_UINT32 f, OPJ_UINT32 orient) { - return lut_ctxno_zc[(orient << 8) | (f & T1_SIG_OTH)]; +static OPJ_BYTE opj_t1_getctxno_zc(opj_mqc_t *mqc, OPJ_UINT32 f) { + return mqc->lut_ctxno_zc_orient[(f & T1_SIG_OTH)]; } static OPJ_BYTE opj_t1_getctxno_sc(OPJ_UINT32 f) { @@ -358,7 +345,6 @@ static void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stri static void opj_t1_enc_sigpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_UINT32 orient, OPJ_INT32 bpno, OPJ_INT32 one, OPJ_INT32 *nmsedec, @@ -374,7 +360,7 @@ static void opj_t1_enc_sigpass_step( opj_t1_t *t1, flag = vsc ? (OPJ_UINT32)((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (OPJ_UINT32)(*flagsp); if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { v = (opj_int_abs(*datap) & one) ? 1 : 0; - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(flag, orient)); /* ESSAI */ + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, flag)); /* ESSAI */ if (type == T1_TYPE_RAW) { /* BYPASS/LAZY MODE */ opj_mqc_bypass_enc(mqc, (OPJ_UINT32)v); } else { @@ -400,14 +386,12 @@ static INLINE void opj_t1_dec_sigpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 vsc) { OPJ_INT32 v, flag; opj_raw_t *raw = t1->raw; /* RAW component */ - OPJ_ARG_NOT_USED(orient); - + flag = vsc ? 
((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { if (opj_raw_decode(raw)) { @@ -423,7 +407,6 @@ static INLINE void opj_t1_dec_sigpass_step_mqc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf) { OPJ_INT32 v, flag; @@ -432,7 +415,7 @@ static INLINE void opj_t1_dec_sigpass_step_mqc( flag = *flagsp; if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); @@ -447,7 +430,6 @@ static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 vsc) { @@ -457,7 +439,7 @@ static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( flag = vsc ? 
((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); @@ -472,7 +454,6 @@ static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( static void opj_t1_enc_sigpass(opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_UINT32 orient, OPJ_INT32 *nmsedec, OPJ_BYTE type, OPJ_UINT32 cblksty @@ -491,7 +472,6 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->data_stride) + i], - orient, bpno, one, nmsedec, @@ -505,7 +485,6 @@ static void opj_t1_enc_sigpass(opj_t1_t *t1, static void opj_t1_dec_sigpass_raw( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_INT32 orient, OPJ_INT32 cblksty) { OPJ_INT32 one, half, oneplushalf, vsc; @@ -521,7 +500,6 @@ static void opj_t1_dec_sigpass_raw( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->w) + i], - orient, oneplushalf, vsc); } @@ -529,7 +507,7 @@ static void opj_t1_dec_sigpass_raw( } } /* VSC and BYPASS by Antonin */ -#define opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, w, h, flags_stride) \ +#define opj_t1_dec_sigpass_mqc_internal(t1, bpno, w, h, flags_stride) \ { \ OPJ_INT32 one, half, oneplushalf; \ OPJ_UINT32 i, j, k; \ @@ -543,16 +521,16 @@ static void opj_t1_dec_sigpass_raw( OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ data2 += w; \ 
flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ data2 += w; \ } \ data1 += w << 2; \ @@ -563,7 +541,7 @@ static void opj_t1_dec_sigpass_raw( opj_flag_t *flags2 = flags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ data2 += w; \ } \ } \ @@ -571,25 +549,22 @@ static void opj_t1_dec_sigpass_raw( static void opj_t1_dec_sigpass_mqc_64x64( opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 orient) + OPJ_INT32 bpno) { - opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, 64, 64, 66); + opj_t1_dec_sigpass_mqc_internal(t1, bpno, 64, 64, 66); } static void opj_t1_dec_sigpass_mqc_generic( opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 orient) + OPJ_INT32 bpno) { - opj_t1_dec_sigpass_mqc_internal(t1, bpno, orient, t1->w, t1->h, t1->flags_stride); + opj_t1_dec_sigpass_mqc_internal(t1, bpno, t1->w, t1->h, t1->flags_stride); } /* VSC and BYPASS by Antonin */ static void opj_t1_dec_sigpass_mqc_vsc( opj_t1_t *t1, - OPJ_INT32 bpno, - OPJ_INT32 orient) + OPJ_INT32 bpno) { OPJ_INT32 one, half, oneplushalf, vsc; OPJ_UINT32 i, j, k; @@ -604,7 +579,6 @@ static void opj_t1_dec_sigpass_mqc_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->w) + i], - orient, oneplushalf, vsc); } @@ -850,7 +824,6 @@ static void opj_t1_enc_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_UINT32 orient, OPJ_INT32 bpno, OPJ_INT32 one, OPJ_INT32 *nmsedec, @@ -867,7 +840,7 @@ static void opj_t1_enc_clnpass_step( goto LABEL_PARTIAL; } if (!(*flagsp & (T1_SIG | T1_VISIT))) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(flag, orient)); + 
opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, flag)); v = (opj_int_abs(*datap) & one) ? 1 : 0; opj_mqc_encode(mqc, (OPJ_UINT32)v); if (v) { @@ -886,14 +859,11 @@ static void opj_t1_dec_clnpass_step_partial( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf) { OPJ_INT32 v, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - OPJ_ARG_NOT_USED(orient); - flag = *flagsp; opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); @@ -906,7 +876,6 @@ static void opj_t1_dec_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf) { OPJ_INT32 v, flag; @@ -915,7 +884,7 @@ static void opj_t1_dec_clnpass_step( flag = *flagsp; if (!(flag & (T1_SIG | T1_VISIT))) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); @@ -930,7 +899,6 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf) { OPJ_INT32 v; @@ -941,7 +909,7 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( flag = *flagsp; /*if (!(flag & (T1_SIG | T1_VISIT)))*/ { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); @@ -956,7 +924,6 @@ static void opj_t1_dec_clnpass_step_vsc( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, - OPJ_INT32 orient, OPJ_INT32 oneplushalf, OPJ_INT32 partial, OPJ_INT32 vsc) @@ -970,7 +937,7 @@ static void 
opj_t1_dec_clnpass_step_vsc( goto LABEL_PARTIAL; } if (!(flag & (T1_SIG | T1_VISIT))) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc((OPJ_UINT32)flag, (OPJ_UINT32)orient)); + opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { LABEL_PARTIAL: opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); @@ -985,7 +952,6 @@ LABEL_PARTIAL: static void opj_t1_enc_clnpass( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_UINT32 orient, OPJ_INT32 *nmsedec, OPJ_UINT32 cblksty) { @@ -1037,7 +1003,6 @@ static void opj_t1_enc_clnpass( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], &t1->data[(j * t1->data_stride) + i], - orient, bpno, one, nmsedec, @@ -1050,7 +1015,7 @@ static void opj_t1_enc_clnpass( #define MACRO_t1_flags_internal(x,y,flags_stride) t1->flags[((x)*(flags_stride))+(y)] -#define opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, w, h, flags_stride) \ +#define opj_t1_dec_clnpass_internal(t1, bpno, cblksty, w, h, flags_stride) \ { \ OPJ_INT32 one, half, oneplushalf, agg, runlen, vsc; \ OPJ_UINT32 i, j, k; \ @@ -1090,7 +1055,6 @@ static void opj_t1_enc_clnpass( t1, \ &t1->flags[((j+1) * flags_stride) + i + 1], \ &t1->data[(j * w) + i], \ - orient, \ oneplushalf, \ agg && (j == k + (OPJ_UINT32)runlen), \ vsc); \ @@ -1121,9 +1085,9 @@ static void opj_t1_enc_clnpass( for (j = (OPJ_UINT32)runlen; j < 4 && j < h; ++j) { \ flags2 += flags_stride; \ if (agg && (j == (OPJ_UINT32)runlen)) { \ - opj_t1_dec_clnpass_step_partial(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step_partial(t1, flags2, data2, oneplushalf); \ } else { \ - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step(t1, flags2, data2, oneplushalf); \ } \ data2 += w; \ } \ @@ -1132,25 +1096,25 @@ static void opj_t1_enc_clnpass( flags2 += flags_stride; \ flag = *flags2; \ if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + 
opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ flag = *flags2; \ if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ flag = *flags2; \ if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ flag = *flags2; \ if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ *flags2 &= ~T1_VISIT; \ data2 += w; \ } \ @@ -1163,7 +1127,7 @@ static void opj_t1_enc_clnpass( opj_flag_t *flags2 = flags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_clnpass_step(t1, flags2, data2, orient, oneplushalf); \ + opj_t1_dec_clnpass_step(t1, flags2, data2, oneplushalf); \ data2 += w; \ } \ } \ @@ -1187,19 +1151,17 @@ static void opj_t1_enc_clnpass( static void opj_t1_dec_clnpass_64x64( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_INT32 orient, OPJ_INT32 cblksty) { - opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, 64, 64, 66); + opj_t1_dec_clnpass_internal(t1, bpno, cblksty, 64, 64, 66); } static void opj_t1_dec_clnpass_generic( opj_t1_t *t1, OPJ_INT32 bpno, - OPJ_INT32 orient, OPJ_INT32 cblksty) { - opj_t1_dec_clnpass_internal(t1, bpno, orient, cblksty, t1->w, t1->h, t1->flags_stride); + opj_t1_dec_clnpass_internal(t1, bpno, cblksty, t1->w, t1->h, t1->flags_stride); } @@ -1443,6 +1405,8 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, { opj_raw_t 
*raw = t1->raw; /* RAW component */ opj_mqc_t *mqc = t1->mqc; /* MQC component */ + + mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; OPJ_INT32 bpno_plus_one; OPJ_UINT32 passtype; @@ -1488,12 +1452,12 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, switch (passtype) { case 0: if (type == T1_TYPE_RAW) { - opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); } else { if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one, (OPJ_INT32)orient); + opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one); } else { - opj_t1_dec_sigpass_mqc_64x64(t1, bpno_plus_one, (OPJ_INT32)orient); + opj_t1_dec_sigpass_mqc_64x64(t1, bpno_plus_one); } } break; @@ -1509,7 +1473,7 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, } break; case 2: - opj_t1_dec_clnpass_64x64(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + opj_t1_dec_clnpass_64x64(t1, bpno_plus_one, (OPJ_INT32)cblksty); break; } @@ -1531,12 +1495,12 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, switch (passtype) { case 0: if (type == T1_TYPE_RAW) { - opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + opj_t1_dec_sigpass_raw(t1, bpno_plus_one, (OPJ_INT32)cblksty); } else { if (cblksty & J2K_CCP_CBLKSTY_VSC) { - opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one, (OPJ_INT32)orient); + opj_t1_dec_sigpass_mqc_vsc(t1, bpno_plus_one); } else { - opj_t1_dec_sigpass_mqc_generic(t1, bpno_plus_one, (OPJ_INT32)orient); + opj_t1_dec_sigpass_mqc_generic(t1, bpno_plus_one); } } break; @@ -1552,7 +1516,7 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, } break; case 2: - opj_t1_dec_clnpass_generic(t1, bpno_plus_one, (OPJ_INT32)orient, (OPJ_INT32)cblksty); + opj_t1_dec_clnpass_generic(t1, bpno_plus_one, (OPJ_INT32)cblksty); break; } @@ -1695,6 +1659,7 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, OPJ_FLOAT64 cumwmsedec = 0.0; opj_mqc_t *mqc = t1->mqc; /* MQC component 
*/ + mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; OPJ_UINT32 passno; OPJ_INT32 bpno; @@ -1731,13 +1696,13 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, switch (passtype) { case 0: - opj_t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty); + opj_t1_enc_sigpass(t1, bpno, &nmsedec, type, cblksty); break; case 1: opj_t1_enc_refpass(t1, bpno, &nmsedec, type, cblksty); break; case 2: - opj_t1_enc_clnpass(t1, bpno, orient, &nmsedec, cblksty); + opj_t1_enc_clnpass(t1, bpno, &nmsedec, cblksty); /* code switch SEGMARK (i.e. SEGSYM) */ if (cblksty & J2K_CCP_CBLKSTY_SEGSYM) opj_mqc_segmark_enc(mqc); From 31882ad7f4a0d9d0231c3fdb9c75a6b69912e1b7 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 22 May 2016 00:54:06 +0200 Subject: [PATCH 06/22] Const'ify lut arrays so they are in the read-only data section --- src/lib/openjp2/t1_generate_luts.c | 14 +++++++------- src/lib/openjp2/t1_luts.h | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/lib/openjp2/t1_generate_luts.c b/src/lib/openjp2/t1_generate_luts.c index cba7245d..1e6e7b06 100644 --- a/src/lib/openjp2/t1_generate_luts.c +++ b/src/lib/openjp2/t1_generate_luts.c @@ -216,7 +216,7 @@ int main(int argc, char **argv) } } - printf("static OPJ_BYTE lut_ctxno_zc[1024] = {\n "); + printf("static const OPJ_BYTE lut_ctxno_zc[1024] = {\n "); for (i = 0; i < 1023; ++i) { printf("%i, ", lut_ctxno_zc[i]); if(!((i+1)&0x1f)) @@ -225,7 +225,7 @@ int main(int argc, char **argv) printf("%i\n};\n\n", lut_ctxno_zc[1023]); /* lut_ctxno_sc */ - printf("static OPJ_BYTE lut_ctxno_sc[256] = {\n "); + printf("static const OPJ_BYTE lut_ctxno_sc[256] = {\n "); for (i = 0; i < 255; ++i) { printf("0x%x, ", t1_init_ctxno_sc(i << 4)); if(!((i+1)&0xf)) @@ -234,7 +234,7 @@ int main(int argc, char **argv) printf("0x%x\n};\n\n", t1_init_ctxno_sc(255 << 4)); /* lut_spb */ - printf("static OPJ_BYTE lut_spb[256] = {\n "); + printf("static const OPJ_BYTE lut_spb[256] = {\n "); for (i = 0; i < 255; ++i) 
{ printf("%i, ", t1_init_spb(i << 4)); if(!((i+1)&0x1f)) @@ -268,16 +268,16 @@ int main(int argc, char **argv) (int) (floor((u * u) * pow(2, T1_NMSEDEC_FRACBITS) + 0.5) / pow(2, T1_NMSEDEC_FRACBITS) * 8192.0)); } - printf("static OPJ_INT16 lut_nmsedec_sig[1 << T1_NMSEDEC_BITS] = {\n "); + printf("static const OPJ_INT16 lut_nmsedec_sig[1 << T1_NMSEDEC_BITS] = {\n "); dump_array16(lut_nmsedec_sig, 1 << T1_NMSEDEC_BITS); - printf("static OPJ_INT16 lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS] = {\n "); + printf("static const OPJ_INT16 lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS] = {\n "); dump_array16(lut_nmsedec_sig0, 1 << T1_NMSEDEC_BITS); - printf("static OPJ_INT16 lut_nmsedec_ref[1 << T1_NMSEDEC_BITS] = {\n "); + printf("static const OPJ_INT16 lut_nmsedec_ref[1 << T1_NMSEDEC_BITS] = {\n "); dump_array16(lut_nmsedec_ref, 1 << T1_NMSEDEC_BITS); - printf("static OPJ_INT16 lut_nmsedec_ref0[1 << T1_NMSEDEC_BITS] = {\n "); + printf("static const OPJ_INT16 lut_nmsedec_ref0[1 << T1_NMSEDEC_BITS] = {\n "); dump_array16(lut_nmsedec_ref0, 1 << T1_NMSEDEC_BITS); return 0; diff --git a/src/lib/openjp2/t1_luts.h b/src/lib/openjp2/t1_luts.h index 37776b65..c66a8aeb 100644 --- a/src/lib/openjp2/t1_luts.h +++ b/src/lib/openjp2/t1_luts.h @@ -1,6 +1,6 @@ /* This file was automatically generated by t1_generate_luts.c */ -static OPJ_BYTE lut_ctxno_zc[1024] = { +static const OPJ_BYTE lut_ctxno_zc[1024] = { 0, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -35,7 +35,7 @@ static OPJ_BYTE lut_ctxno_zc[1024] = { 2, 5, 5, 7, 5, 7, 7, 8, 5, 7, 7, 8, 7, 8, 8, 8, 2, 5, 5, 7, 5, 7, 7, 8, 5, 7, 7, 8, 7, 8, 8, 8 }; -static OPJ_BYTE lut_ctxno_sc[256] = { +static const OPJ_BYTE lut_ctxno_sc[256] = { 0x9, 0xa, 0xc, 0xd, 0xa, 0xa, 0xd, 0xd, 0xc, 0xd, 0xc, 0xd, 0xd, 0xd, 0xd, 
0xd, 0x9, 0xa, 0xc, 0xb, 0xa, 0x9, 0xd, 0xc, 0xc, 0xb, 0xc, 0xb, 0xd, 0xc, 0xd, 0xc, 0x9, 0xa, 0xc, 0xb, 0xa, 0xa, 0xb, 0xb, 0xc, 0xd, 0x9, 0xa, 0xd, 0xd, 0xa, 0xa, @@ -54,7 +54,7 @@ static OPJ_BYTE lut_ctxno_sc[256] = { 0x9, 0xa, 0xc, 0xd, 0xa, 0xa, 0xd, 0xd, 0xc, 0xd, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd }; -static OPJ_BYTE lut_spb[256] = { +static const OPJ_BYTE lut_spb[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -65,7 +65,7 @@ static OPJ_BYTE lut_spb[256] = { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; -static OPJ_INT16 lut_nmsedec_sig[1 << T1_NMSEDEC_BITS] = { +static const OPJ_INT16 lut_nmsedec_sig[1 << T1_NMSEDEC_BITS] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -84,7 +84,7 @@ static OPJ_INT16 lut_nmsedec_sig[1 << T1_NMSEDEC_BITS] = { 0x6c00, 0x6d80, 0x6f00, 0x7080, 0x7200, 0x7380, 0x7500, 0x7680 }; -static OPJ_INT16 lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS] = { +static const OPJ_INT16 lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0080, 0x0080, 0x0080, 0x0080, 0x0100, 0x0100, 0x0100, 0x0180, 0x0180, 0x0200, 0x0200, 0x0280, 0x0280, 0x0300, 0x0300, 0x0380, 0x0400, 0x0400, @@ -103,7 +103,7 @@ static OPJ_INT16 lut_nmsedec_sig0[1 << T1_NMSEDEC_BITS] = { 0x7080, 0x7280, 0x7480, 0x7600, 0x7800, 0x7a00, 0x7c00, 0x7e00 }; -static OPJ_INT16 lut_nmsedec_ref[1 << T1_NMSEDEC_BITS] = { +static const OPJ_INT16 lut_nmsedec_ref[1 << T1_NMSEDEC_BITS] = { 0x1800, 0x1780, 0x1700, 0x1680, 0x1600, 0x1580, 0x1500, 0x1480, 0x1400, 0x1380, 0x1300, 0x1280, 0x1200, 0x1180, 0x1100, 0x1080, 
0x1000, 0x0f80, 0x0f00, 0x0e80, 0x0e00, 0x0d80, 0x0d00, 0x0c80, @@ -122,7 +122,7 @@ static OPJ_INT16 lut_nmsedec_ref[1 << T1_NMSEDEC_BITS] = { 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780 }; -static OPJ_INT16 lut_nmsedec_ref0[1 << T1_NMSEDEC_BITS] = { +static const OPJ_INT16 lut_nmsedec_ref0[1 << T1_NMSEDEC_BITS] = { 0x2000, 0x1f00, 0x1e00, 0x1d00, 0x1c00, 0x1b00, 0x1a80, 0x1980, 0x1880, 0x1780, 0x1700, 0x1600, 0x1500, 0x1480, 0x1380, 0x1300, 0x1200, 0x1180, 0x1080, 0x1000, 0x0f00, 0x0e80, 0x0e00, 0x0d00, From 1da397e94a4e441a7c9a1aa4c2debd1c06ba05e2 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 01:33:06 +0200 Subject: [PATCH 07/22] Tier 1 decoding: add a colflags array Additional flag array such that colflags[1+0] is for state of col=0,row=0..3, colflags[1+1] for col=1, row=0..3, colflags[1+flags_stride] for col=0,row=4..7, ... This array avoids too much cache thrashing when processing by 4 vertical samples as done in the various decoding steps. --- src/lib/openjp2/t1.c | 400 +++++++++++++++++++++++++++++++------------ src/lib/openjp2/t1.h | 36 +++- 2 files changed, 327 insertions(+), 109 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 30919585..9ad6ffd0 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -39,6 +39,8 @@ #include "opj_includes.h" #include "t1_luts.h" +/* #define CONSISTENCY_CHECK */ + /** @defgroup T1 T1 - Implementation of the tier-1 coding */ /*@{*/ @@ -51,7 +53,7 @@ static INLINE OPJ_UINT32 opj_t1_getctxno_mag(OPJ_UINT32 f); static OPJ_BYTE opj_t1_getspb(OPJ_UINT32 f); static OPJ_INT16 opj_t1_getnmsedec_sig(OPJ_UINT32 x, OPJ_UINT32 bitpos); static OPJ_INT16 opj_t1_getnmsedec_ref(OPJ_UINT32 x, OPJ_UINT32 bitpos); -static void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stride); +static INLINE void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stride); /** Encode significant pass */ @@ -80,20 +82,26 @@ static void
opj_t1_dec_sigpass_step(opj_t1_t *t1, static INLINE void opj_t1_dec_sigpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 vsc); + OPJ_INT32 vsc, + OPJ_INT32 row); static INLINE void opj_t1_dec_sigpass_step_mqc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf); + OPJ_INT32 oneplushalf, + OPJ_INT32 row); static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 vsc); + OPJ_INT32 vsc, + OPJ_INT32 row); /** @@ -168,23 +176,28 @@ static void opj_t1_dec_refpass_step(opj_t1_t *t1, static INLINE void opj_t1_dec_refpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 poshalf, OPJ_INT32 neghalf, - OPJ_INT32 vsc); + OPJ_INT32 row); static INLINE void opj_t1_dec_refpass_step_mqc( opj_t1_t *t1, opj_flag_t *flagsp, - OPJ_INT32 *datap, - OPJ_INT32 poshalf, - OPJ_INT32 neghalf); -static INLINE void opj_t1_dec_refpass_step_mqc_vsc( - opj_t1_t *t1, - opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 poshalf, OPJ_INT32 neghalf, - OPJ_INT32 vsc); + OPJ_INT32 row); +static INLINE void opj_t1_dec_refpass_step_mqc_vsc( + opj_t1_t *t1, + opj_flag_t *flagsp, + opj_colflag_t *colflagsp, + OPJ_INT32 *datap, + OPJ_INT32 poshalf, + OPJ_INT32 neghalf, + OPJ_INT32 vsc, + OPJ_INT32 row); @@ -206,20 +219,26 @@ Decode clean-up pass static void opj_t1_dec_clnpass_step_partial( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf); + OPJ_INT32 oneplushalf, + OPJ_INT32 row); static void opj_t1_dec_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf); + OPJ_INT32 oneplushalf, + OPJ_INT32 row); static void opj_t1_dec_clnpass_step_vsc( opj_t1_t *t1, opj_flag_t *flagsp, + 
opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, OPJ_INT32 partial, - OPJ_INT32 vsc); + OPJ_INT32 vsc, + OPJ_INT32 row); /** Encode clean-up pass */ @@ -319,7 +338,7 @@ static OPJ_INT16 opj_t1_getnmsedec_ref(OPJ_UINT32 x, OPJ_UINT32 bitpos) { return lut_nmsedec_ref0[x & ((1 << T1_NMSEDEC_BITS) - 1)]; } -static void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stride) { +static INLINE void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stride) { opj_flag_t *np = flagsp - stride; opj_flag_t *sp = flagsp + stride; @@ -342,6 +361,47 @@ static void opj_t1_updateflags(opj_flag_t *flagsp, OPJ_UINT32 s, OPJ_UINT32 stri sp[1] |= T1_SIG_NW; } +static INLINE void opj_t1_updateflagscolflags(opj_flag_t *flagsp, opj_colflag_t *colflagsp, OPJ_UINT32 s, OPJ_UINT32 stride, OPJ_INT32 row) +{ + opj_t1_updateflags(flagsp, s, stride); + if( row == 0 ) + { + *colflagsp |= (T1_COLFLAG_SIG_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + *(colflagsp - 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + *(colflagsp + 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + *(colflagsp - stride - 1) |= (T1_COLFLAG_SIG_OTHER_ROW_3); + *(colflagsp - stride) |= (T1_COLFLAG_SIG_OTHER_ROW_3); + *(colflagsp - stride + 1) |= (T1_COLFLAG_SIG_OTHER_ROW_3); + } + else if( row == 3 ) + { + *colflagsp |= (T1_COLFLAG_SIG_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row-1))); + *(colflagsp - 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row-1))); + *(colflagsp + 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS* (row-1))); + *(colflagsp + stride - 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0); + 
*(colflagsp + stride) |= (T1_COLFLAG_SIG_OTHER_ROW_0); + *(colflagsp + stride + 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0); + } + else + { + *(colflagsp - 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row-1))) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + *colflagsp |= (T1_COLFLAG_SIG_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row-1))) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + *(colflagsp + 1) |= (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row-1))) | + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * (row+1))); + } +} + static void opj_t1_enc_sigpass_step( opj_t1_t *t1, opj_flag_t *flagsp, OPJ_INT32 *datap, @@ -385,68 +445,88 @@ static void opj_t1_enc_sigpass_step( opj_t1_t *t1, static INLINE void opj_t1_dec_sigpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 vsc) + OPJ_INT32 vsc, + OPJ_INT32 row) { OPJ_INT32 v, flag; opj_raw_t *raw = t1->raw; /* RAW component */ flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); - if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { + if ((flag & T1_SIG_OTH) && !(*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row)))) { if (opj_raw_decode(raw)) { v = (OPJ_INT32)opj_raw_decode(raw); /* ESSAI */ *datap = v ? 
-oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); } +#ifdef CONSISTENCY_CHECK *flagsp |= T1_VISIT; +#endif + *colflagsp |= (T1_COLFLAG_VISIT_ROW_0 << (T1_COLFLAG_RBS * row)); } } static INLINE void opj_t1_dec_sigpass_step_mqc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf) + OPJ_INT32 oneplushalf, + OPJ_INT32 row) { OPJ_INT32 v, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - - flag = *flagsp; - if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { +#ifdef CONSISTENCY_CHECK + assert( ((*flagsp & T1_SIG_OTH) && !(*flagsp & (T1_SIG | T1_VISIT))) == + ((*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_SIG_OTHER_ROW_0) << (T1_COLFLAG_RBS * row))) == + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row))) ); +#endif + if( (*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_SIG_OTHER_ROW_0) << (T1_COLFLAG_RBS * row))) == + (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row)) ) { + flag = *flagsp; opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? 
-oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); } +#ifdef CONSISTENCY_CHECK *flagsp |= T1_VISIT; +#endif + *colflagsp |= (T1_COLFLAG_VISIT_ROW_0 << (T1_COLFLAG_RBS * row)); } } /* VSC and BYPASS by Antonin */ static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 vsc) + OPJ_INT32 vsc, + OPJ_INT32 row) { OPJ_INT32 v, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); - if ((flag & T1_SIG_OTH) && !(flag & (T1_SIG | T1_VISIT))) { + if ((flag & T1_SIG_OTH) && !(*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row)))) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? -oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); } +#ifdef CONSISTENCY_CHECK *flagsp |= T1_VISIT; +#endif + *colflagsp |= (T1_COLFLAG_VISIT_ROW_0 << (T1_COLFLAG_RBS * row)); } } /* VSC and BYPASS by Antonin */ @@ -489,21 +569,26 @@ static void opj_t1_dec_sigpass_raw( { OPJ_INT32 one, half, oneplushalf, vsc; OPJ_UINT32 i, j, k; + opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; half = one >> 1; oneplushalf = one | half; for (k = 0; k < t1->h; k += 4) { for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; for (j = k; j < k + 4 && j < t1->h; ++j) { vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (j == k + 3 || j == t1->h - 1)) ? 
1 : 0; opj_t1_dec_sigpass_step_raw( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, &t1->data[(j * t1->w) + i], oneplushalf, - vsc); + vsc, + j - k); } } + colflags1 += t1->flags_stride; } } /* VSC and BYPASS by Antonin */ @@ -513,6 +598,7 @@ static void opj_t1_dec_sigpass_raw( OPJ_UINT32 i, j, k; \ OPJ_INT32 *data1 = t1->data; \ opj_flag_t *flags1 = &t1->flags[1]; \ + opj_colflag_t *colflags1 = &t1->colflags[flags_stride + 1]; \ one = 1 << bpno; \ half = one >> 1; \ oneplushalf = one | half; \ @@ -520,28 +606,32 @@ static void opj_t1_dec_sigpass_raw( for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ + opj_colflag_t *colflags2 = colflags1 + i; \ + if( *colflags2 == 0 ) continue; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 0); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 1); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 2); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 3); \ data2 += w; \ } \ data1 += w << 2; \ flags1 += flags_stride << 2; \ + colflags1 += flags_stride; \ } \ for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ + opj_colflag_t *colflags2 = colflags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, j - k); \ data2 += w; \ } \ } \ @@ -568,21 +658,26 @@ static void 
opj_t1_dec_sigpass_mqc_vsc( { OPJ_INT32 one, half, oneplushalf, vsc; OPJ_UINT32 i, j, k; + opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; half = one >> 1; oneplushalf = one | half; for (k = 0; k < t1->h; k += 4) { for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; for (j = k; j < k + 4 && j < t1->h; ++j) { vsc = (j == k + 3 || j == t1->h - 1) ? 1 : 0; opj_t1_dec_sigpass_step_mqc_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, &t1->data[(j * t1->w) + i], oneplushalf, - vsc); + vsc, + j - k); } } + colflags1 += t1->flags_stride; } } /* VSC and BYPASS by Antonin */ @@ -619,64 +714,81 @@ static void opj_t1_enc_refpass_step( opj_t1_t *t1, static INLINE void opj_t1_dec_refpass_step_raw( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 poshalf, OPJ_INT32 neghalf, - OPJ_INT32 vsc) + OPJ_INT32 row) { - OPJ_INT32 v, t, flag; + OPJ_INT32 v, t; opj_raw_t *raw = t1->raw; /* RAW component */ - flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); - if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { + if ((*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row))) == + ((T1_COLFLAG_SIG_ROW_0) << (T1_COLFLAG_RBS * row))) { v = (OPJ_INT32)opj_raw_decode(raw); t = v ? poshalf : neghalf; *datap += *datap < 0 ? 
-t : t; - *flagsp |= T1_REFINE; + *colflagsp |= (T1_COLFLAG_REFINE_ROW_0 << (T1_COLFLAG_RBS * row)); } } /* VSC and BYPASS by Antonin */ static INLINE void opj_t1_dec_refpass_step_mqc( opj_t1_t *t1, +#ifdef CONSISTENCY_CHECK opj_flag_t *flagsp, +#else + opj_flag_t *flagsp_unused, +#endif + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 poshalf, - OPJ_INT32 neghalf) + OPJ_INT32 neghalf, + OPJ_INT32 row) { - OPJ_INT32 v, t, flag; + OPJ_INT32 v, t; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - - flag = *flagsp; - if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_mag((OPJ_UINT32)flag)); /* ESSAI */ +#ifdef CONSISTENCY_CHECK + assert( ((*flagsp & (T1_SIG | T1_VISIT)) == T1_SIG) == + ((*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row))) == ((T1_COLFLAG_SIG_ROW_0) << (T1_COLFLAG_RBS * row))) ); +#endif + if ((*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row))) == + ((T1_COLFLAG_SIG_ROW_0) << (T1_COLFLAG_RBS * row))) { + OPJ_UINT32 tmp1 = (*colflagsp & (T1_COLFLAG_SIG_OTHER_ROW_0 << (T1_COLFLAG_RBS * row))) ? T1_CTXNO_MAG + 1 : T1_CTXNO_MAG; + OPJ_UINT32 tmp2 = (*colflagsp & (T1_COLFLAG_REFINE_ROW_0 << (T1_COLFLAG_RBS * row))) ? T1_CTXNO_MAG + 2 : tmp1; + opj_mqc_setcurctx(mqc, tmp2); /* ESSAI */ v = opj_mqc_decode(mqc); t = v ? poshalf : neghalf; *datap += *datap < 0 ? -t : t; - *flagsp |= T1_REFINE; + *colflagsp |= (T1_COLFLAG_REFINE_ROW_0 << (T1_COLFLAG_RBS * row)); } } /* VSC and BYPASS by Antonin */ static INLINE void opj_t1_dec_refpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 poshalf, OPJ_INT32 neghalf, - OPJ_INT32 vsc) + OPJ_INT32 vsc, + OPJ_INT32 row) { OPJ_INT32 v, t, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - flag = vsc ? 
((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); - if ((flag & (T1_SIG | T1_VISIT)) == T1_SIG) { - opj_mqc_setcurctx(mqc, opj_t1_getctxno_mag((OPJ_UINT32)flag)); /* ESSAI */ + if ((*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row))) == + ((T1_COLFLAG_SIG_ROW_0) << (T1_COLFLAG_RBS * row))) { + OPJ_INT32 flag = vsc ? ((*flagsp) & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) : (*flagsp); + OPJ_UINT32 tmp1 = (flag & T1_SIG_OTH) ? T1_CTXNO_MAG + 1 : T1_CTXNO_MAG; + OPJ_UINT32 tmp2 = (*colflagsp & (T1_COLFLAG_REFINE_ROW_0 << (T1_COLFLAG_RBS * row))) ? T1_CTXNO_MAG + 2 : tmp1; + opj_mqc_setcurctx(mqc, tmp2); /* ESSAI */ v = opj_mqc_decode(mqc); t = v ? poshalf : neghalf; *datap += *datap < 0 ? -t : t; - *flagsp |= T1_REFINE; + *colflagsp |= (T1_COLFLAG_REFINE_ROW_0 << (T1_COLFLAG_RBS * row)); } } /* VSC and BYPASS by Antonin */ @@ -719,22 +831,24 @@ static void opj_t1_dec_refpass_raw( OPJ_INT32 one, poshalf, neghalf; OPJ_UINT32 i, j, k; OPJ_INT32 vsc; + opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; poshalf = one >> 1; neghalf = bpno > 0 ? -poshalf : -1; for (k = 0; k < t1->h; k += 4) { for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; for (j = k; j < k + 4 && j < t1->h; ++j) { - vsc = ((cblksty & J2K_CCP_CBLKSTY_VSC) && (j == k + 3 || j == t1->h - 1)) ? 1 : 0; opj_t1_dec_refpass_step_raw( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, &t1->data[(j * t1->w) + i], poshalf, - neghalf, - vsc); + neghalf, j - k); } } + colflags1 += t1->flags_stride; } } /* VSC and BYPASS by Antonin */ @@ -744,6 +858,7 @@ static void opj_t1_dec_refpass_raw( OPJ_UINT32 i, j, k; \ OPJ_INT32 *data1 = t1->data; \ opj_flag_t *flags1 = &t1->flags[1]; \ + opj_colflag_t *colflags1 = &t1->colflags[flags_stride + 1]; \ one = 1 << bpno; \ poshalf = one >> 1; \ neghalf = bpno > 0 ? 
-poshalf : -1; \ @@ -751,28 +866,32 @@ static void opj_t1_dec_refpass_raw( for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ + opj_colflag_t *colflags2 = colflags1 + i; \ + if( *colflags2 == 0 ) continue; \ flags2 += flags_stride; \ - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + opj_t1_dec_refpass_step_mqc(t1, flags2, colflags2, data2, poshalf, neghalf, 0); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + opj_t1_dec_refpass_step_mqc(t1, flags2, colflags2, data2, poshalf, neghalf, 1); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + opj_t1_dec_refpass_step_mqc(t1, flags2, colflags2, data2, poshalf, neghalf, 2); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + opj_t1_dec_refpass_step_mqc(t1, flags2, colflags2, data2, poshalf, neghalf, 3); \ data2 += w; \ } \ data1 += w << 2; \ flags1 += flags_stride << 2; \ + colflags1 += flags_stride; \ } \ for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ + opj_colflag_t *colflags2 = colflags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_refpass_step_mqc(t1, flags2, data2, poshalf, neghalf); \ + opj_t1_dec_refpass_step_mqc(t1, flags2, colflags2, data2, poshalf, neghalf, j - k); \ data2 += w; \ } \ } \ @@ -800,22 +919,26 @@ static void opj_t1_dec_refpass_mqc_vsc( OPJ_INT32 one, poshalf, neghalf; OPJ_UINT32 i, j, k; OPJ_INT32 vsc; + opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; poshalf = one >> 1; neghalf = bpno > 0 ? -poshalf : -1; for (k = 0; k < t1->h; k += 4) { for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; for (j = k; j < k + 4 && j < t1->h; ++j) { vsc = ((j == k + 3 || j == t1->h - 1)) ? 
1 : 0; opj_t1_dec_refpass_step_mqc_vsc( t1, &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, &t1->data[(j * t1->w) + i], poshalf, neghalf, - vsc); + vsc, j - k); } } + colflags1 += t1->flags_stride; } } /* VSC and BYPASS by Antonin */ @@ -858,8 +981,10 @@ LABEL_PARTIAL: static void opj_t1_dec_clnpass_step_partial( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf) + OPJ_INT32 oneplushalf, + OPJ_INT32 row) { OPJ_INT32 v, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ @@ -868,38 +993,48 @@ static void opj_t1_dec_clnpass_step_partial( opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? -oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); +#ifdef CONSISTENCY_CHECK *flagsp &= ~T1_VISIT; +#endif } /* VSC and BYPASS by Antonin */ static void opj_t1_dec_clnpass_step( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf) + OPJ_INT32 oneplushalf, + OPJ_INT32 row) { OPJ_INT32 v, flag; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - - flag = *flagsp; - if (!(flag & (T1_SIG | T1_VISIT))) { +#ifdef CONSISTENCY_CHECK + assert( (!(*flagsp & (T1_SIG | T1_VISIT))) == (!(*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (4*row)))) ); +#endif + if (!(*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (4*row)))) { + flag = *flagsp; opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? 
-oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); } } +#ifdef CONSISTENCY_CHECK *flagsp &= ~T1_VISIT; +#endif } /* VSC and BYPASS by Antonin */ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, - OPJ_INT32 oneplushalf) + OPJ_INT32 oneplushalf, + OPJ_INT32 row) { OPJ_INT32 v; OPJ_INT32 flag; @@ -914,7 +1049,7 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? -oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, v, t1->flags_stride, row); } } /*flagsp &= ~T1_VISIT;*/ @@ -923,10 +1058,12 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( static void opj_t1_dec_clnpass_step_vsc( opj_t1_t *t1, opj_flag_t *flagsp, + opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, OPJ_INT32 partial, - OPJ_INT32 vsc) + OPJ_INT32 vsc, + OPJ_INT32 row) { OPJ_INT32 v, flag; @@ -936,17 +1073,19 @@ static void opj_t1_dec_clnpass_step_vsc( if (partial) { goto LABEL_PARTIAL; } - if (!(flag & (T1_SIG | T1_VISIT))) { + if (!(*colflagsp & ((T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0) << (T1_COLFLAG_RBS * row)))) { opj_mqc_setcurctx(mqc, opj_t1_getctxno_zc(mqc, (OPJ_UINT32)flag)); if (opj_mqc_decode(mqc)) { LABEL_PARTIAL: opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? 
-oneplushalf : oneplushalf; - opj_t1_updateflags(flagsp, (OPJ_UINT32)v, t1->flags_stride); + opj_t1_updateflagscolflags(flagsp, colflagsp, v, t1->flags_stride, row); } } +#ifdef CONSISTENCY_CHECK *flagsp &= ~T1_VISIT; +#endif } static void opj_t1_enc_clnpass( @@ -1015,7 +1154,7 @@ static void opj_t1_enc_clnpass( #define MACRO_t1_flags_internal(x,y,flags_stride) t1->flags[((x)*(flags_stride))+(y)] -#define opj_t1_dec_clnpass_internal(t1, bpno, cblksty, w, h, flags_stride) \ +#define opj_t1_dec_clnpass_internal(consistency_check, t1, bpno, cblksty, w, h, flags_stride) \ { \ OPJ_INT32 one, half, oneplushalf, agg, runlen, vsc; \ OPJ_UINT32 i, j, k; \ @@ -1027,14 +1166,17 @@ static void opj_t1_enc_clnpass( half = one >> 1; \ oneplushalf = one | half; \ if (cblksty & J2K_CCP_CBLKSTY_VSC) { \ + opj_colflag_t *colflags1 = &t1->colflags[flags_stride + 1]; \ for (k = 0; k < h; k += 4) { \ for (i = 0; i < w; ++i) { \ + opj_colflag_t *colflags2 = colflags1 + i; \ if (k + 3 < h) { \ - agg = !(MACRO_t1_flags_internal(1 + k,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ - || MACRO_t1_flags_internal(1 + k + 1,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ - || MACRO_t1_flags_internal(1 + k + 2,1 + i,flags_stride) & (T1_SIG | T1_VISIT | T1_SIG_OTH) \ - || (MACRO_t1_flags_internal(1 + k + 3,1 + i,flags_stride) \ - & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); \ + agg = !((*colflags2 & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_SIG_OTHER_ROW_0 | \ + T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1 | T1_COLFLAG_SIG_OTHER_ROW_1 | \ + T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2 | T1_COLFLAG_SIG_OTHER_ROW_2 | \ + T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3)) || \ + ((MACRO_t1_flags_internal(1 + k + 3,1 + i,flags_stride) \ + & ((~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG_OTH)))); \ } else { \ agg = 0; \ } \ @@ -1054,24 +1196,36 @@ static void opj_t1_enc_clnpass( opj_t1_dec_clnpass_step_vsc( \ 
t1, \ &t1->flags[((j+1) * flags_stride) + i + 1], \ + colflags2, \ &t1->data[(j * w) + i], \ oneplushalf, \ agg && (j == k + (OPJ_UINT32)runlen), \ - vsc); \ + vsc, j - k); \ } \ + *colflags2 &= ~(T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_VISIT_ROW_1 | T1_COLFLAG_VISIT_ROW_2 | T1_COLFLAG_VISIT_ROW_3); \ } \ + colflags1 += flags_stride; \ } \ } else { \ OPJ_INT32 *data1 = t1->data; \ opj_flag_t *flags1 = &t1->flags[1]; \ + opj_colflag_t *colflags1 = &t1->colflags[flags_stride + 1]; \ for (k = 0; k < (h & ~3u); k += 4) { \ for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ - agg = !((MACRO_t1_flags_internal(1 + k, 1 + i,flags_stride) | \ - MACRO_t1_flags_internal(1 + k + 1, 1 + i,flags_stride) | \ - MACRO_t1_flags_internal(1 + k + 2, 1 + i,flags_stride) | \ - MACRO_t1_flags_internal(1 + k + 3, 1 + i,flags_stride)) & (T1_SIG | T1_VISIT | T1_SIG_OTH)); \ + opj_colflag_t *colflags2 = colflags1 + i; \ + opj_colflag_t colflags = *colflags2; \ + agg = !(colflags & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_SIG_OTHER_ROW_0 | \ + T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1 | T1_COLFLAG_SIG_OTHER_ROW_1 | \ + T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2 | T1_COLFLAG_SIG_OTHER_ROW_2 | \ + T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3 | T1_COLFLAG_SIG_OTHER_ROW_3)); \ + if( consistency_check ) { \ + assert( agg == !((MACRO_t1_flags_internal(1 + k, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 1, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 2, 1 + i,flags_stride) | \ + MACRO_t1_flags_internal(1 + k + 3, 1 + i,flags_stride)) & (T1_SIG | T1_VISIT | T1_SIG_OTH)) ); \ + } \ if (agg) { \ opj_mqc_setcurctx(mqc, T1_CTXNO_AGG); \ if (!opj_mqc_decode(mqc)) { \ @@ -1084,52 +1238,59 @@ static void opj_t1_enc_clnpass( data2 += (OPJ_UINT32)runlen * w; \ for (j = (OPJ_UINT32)runlen; j < 4 && j < h; ++j) { \ flags2 += flags_stride; \ - if (agg && (j == (OPJ_UINT32)runlen)) { \ - 
opj_t1_dec_clnpass_step_partial(t1, flags2, data2, oneplushalf); \ + if (j == (OPJ_UINT32)runlen) { \ + opj_t1_dec_clnpass_step_partial(t1, flags2, colflags2, data2, oneplushalf, j); \ } else { \ - opj_t1_dec_clnpass_step(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_clnpass_step(t1, flags2, colflags2, data2, oneplushalf, j); \ } \ data2 += w; \ } \ } else { \ - opj_flag_t flag; \ flags2 += flags_stride; \ - flag = *flags2; \ - if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ - *flags2 &= ~T1_VISIT; \ + if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ + if (!(colflags & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0))) {\ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 0); \ + } \ + if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ - flag = *flags2; \ - if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ - *flags2 &= ~T1_VISIT; \ + if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ + if (!(colflags & (T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1))) {\ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 1); \ + } \ + if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ - flag = *flags2; \ - if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ - *flags2 &= ~T1_VISIT; \ + if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ + if (!(colflags & (T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2))) {\ + 
opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 2); \ + } \ + if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ - flag = *flags2; \ - if (!(flag & (T1_SIG | T1_VISIT))) \ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, data2, oneplushalf); \ - *flags2 &= ~T1_VISIT; \ + if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ + if (!(colflags & (T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3))) {\ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 3); \ + } \ + if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ } \ + *colflags2 &= ~(T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_VISIT_ROW_1 | T1_COLFLAG_VISIT_ROW_2 | T1_COLFLAG_VISIT_ROW_3); \ } \ data1 += w << 2; \ flags1 += flags_stride << 2; \ + colflags1 += flags_stride; \ } \ for (i = 0; i < w; ++i) { \ OPJ_INT32 *data2 = data1 + i; \ opj_flag_t *flags2 = flags1 + i; \ + opj_colflag_t *colflags2 = colflags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_clnpass_step(t1, flags2, data2, oneplushalf); \ + opj_t1_dec_clnpass_step(t1, flags2, colflags2, data2, oneplushalf, j - k); \ data2 += w; \ } \ + *colflags2 &= ~(T1_COLFLAG_VISIT_ROW_0 | T1_COLFLAG_VISIT_ROW_1 | T1_COLFLAG_VISIT_ROW_2 | T1_COLFLAG_VISIT_ROW_3); \ } \ } \ \ @@ -1153,7 +1314,11 @@ static void opj_t1_dec_clnpass_64x64( OPJ_INT32 bpno, OPJ_INT32 cblksty) { - opj_t1_dec_clnpass_internal(t1, bpno, cblksty, 64, 64, 66); +#ifdef CONSISTENCY_CHECK + opj_t1_dec_clnpass_internal(OPJ_TRUE, t1, bpno, cblksty, 64, 64, 66); +#else + opj_t1_dec_clnpass_internal(OPJ_FALSE, t1, bpno, cblksty, 64, 64, 66); +#endif } static void opj_t1_dec_clnpass_generic( @@ -1161,7 +1326,11 @@ static void opj_t1_dec_clnpass_generic( OPJ_INT32 bpno, OPJ_INT32 cblksty) { - opj_t1_dec_clnpass_internal(t1, bpno, cblksty, t1->w, 
t1->h, t1->flags_stride); +#ifdef CONSISTENCY_CHECK + opj_t1_dec_clnpass_internal(OPJ_TRUE, t1, bpno, cblksty, t1->w, t1->h, t1->flags_stride); +#else + opj_t1_dec_clnpass_internal(OPJ_FALSE, t1, bpno, cblksty, t1->w, t1->h, t1->flags_stride); +#endif } @@ -1234,6 +1403,21 @@ static OPJ_BOOL opj_t1_allocate_buffers( t1->flagssize=flagssize; } memset(t1->flags,0,flagssize * sizeof(opj_flag_t)); + + if (!t1->encoder) { + OPJ_UINT32 colflags_size=t1->flags_stride * ((h+3) / 4 + 2); + + if(colflags_size > t1->colflags_size){ + opj_aligned_free(t1->colflags); + t1->colflags = (opj_colflag_t*) opj_aligned_malloc(colflags_size * sizeof(opj_colflag_t)); + if(!t1->colflags){ + /* FIXME event manager error callback */ + return OPJ_FALSE; + } + t1->colflags_size=colflags_size; + } + memset(t1->colflags,0,colflags_size * sizeof(opj_colflag_t)); + } t1->w=w; t1->h=h; @@ -1304,6 +1488,10 @@ void opj_t1_destroy(opj_t1_t *p_t1) p_t1->flags = 00; } + if (p_t1->colflags) { + opj_aligned_free(p_t1->colflags); + p_t1->colflags = 00; + } opj_free(p_t1); } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index e9d3db57..22557d96 100644 --- a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -52,6 +52,7 @@ in T1.C are used by some function in TCD.C. /* CAUTION: the value of those constants must not be changed, otherwise the */ /* optimization of opj_t1_updateflags() will break! */ +/* BEGINNING of flags that apply to opj_flag_t */ #define T1_SIG_NE 0x0001 /**< Context orientation : North-East direction */ #define T1_SIG_SE 0x0002 /**< Context orientation : South-East direction */ #define T1_SIG_SW 0x0004 /**< Context orientation : South-West direction */ @@ -69,9 +70,10 @@ in T1.C are used by some function in TCD.C. 
#define T1_SGN_W 0x0800 #define T1_SGN (T1_SGN_N|T1_SGN_E|T1_SGN_S|T1_SGN_W) -#define T1_SIG 0x1000 -#define T1_REFINE 0x2000 -#define T1_VISIT 0x4000 +#define T1_SIG 0x1000 /**< No longer used by decoder */ +#define T1_REFINE 0x2000 /**< No longer used by decoder */ +#define T1_VISIT 0x4000 /**< No longer used by decoder */ +/* END of flags that apply to opj_flag_t */ #define T1_NUMCTXS_ZC 9 #define T1_NUMCTXS_SC 5 @@ -91,10 +93,32 @@ in T1.C are used by some function in TCD.C. #define T1_TYPE_MQ 0 /**< Normal coding using entropy coder */ #define T1_TYPE_RAW 1 /**< No encoding the information is store under raw format in codestream (mode switch RAW)*/ +/* Those flags are used by opj_colflag_t */ +#define T1_COLFLAG_RBS 4 /* RBS = Row Bit Shift */ +#define T1_COLFLAG_SIG_OTHER_ROW_0 (1 << 0) /**< This sample has at least one significant neighbour */ +#define T1_COLFLAG_SIG_ROW_0 (1 << 1) /**< This sample is significant */ +#define T1_COLFLAG_VISIT_ROW_0 (1 << 2) /**< This sample has been visited */ +#define T1_COLFLAG_REFINE_ROW_0 (1 << 3) /**< This sample has been refined */ +#define T1_COLFLAG_SIG_OTHER_ROW_1 (T1_COLFLAG_SIG_OTHER_ROW_0 << T1_COLFLAG_RBS) +#define T1_COLFLAG_SIG_ROW_1 (T1_COLFLAG_SIG_ROW_0 << T1_COLFLAG_RBS) +#define T1_COLFLAG_VISIT_ROW_1 (T1_COLFLAG_VISIT_ROW_0 << T1_COLFLAG_RBS) +#define T1_COLFLAG_REFINE_ROW_1 (T1_COLFLAG_REFINE_ROW_0 << T1_COLFLAG_RBS) +#define T1_COLFLAG_SIG_OTHER_ROW_2 (T1_COLFLAG_SIG_OTHER_ROW_0 << (2*T1_COLFLAG_RBS)) +#define T1_COLFLAG_SIG_ROW_2 (T1_COLFLAG_SIG_ROW_0 << (2*T1_COLFLAG_RBS)) +#define T1_COLFLAG_VISIT_ROW_2 (T1_COLFLAG_VISIT_ROW_0 << (2*T1_COLFLAG_RBS)) +#define T1_COLFLAG_REFINE_ROW_2 (T1_COLFLAG_REFINE_ROW_0 << (2*T1_COLFLAG_RBS)) +#define T1_COLFLAG_SIG_OTHER_ROW_3 (T1_COLFLAG_SIG_OTHER_ROW_0 << (3*T1_COLFLAG_RBS)) +#define T1_COLFLAG_SIG_ROW_3 (T1_COLFLAG_SIG_ROW_0 << (3*T1_COLFLAG_RBS)) +#define T1_COLFLAG_VISIT_ROW_3 (T1_COLFLAG_VISIT_ROW_0 << (3*T1_COLFLAG_RBS)) +#define T1_COLFLAG_REFINE_ROW_3 
(T1_COLFLAG_REFINE_ROW_0 << (3*T1_COLFLAG_RBS)) + /* ----------------------------------------------------------------------- */ typedef OPJ_INT16 opj_flag_t; +/** Flags for 4 consecutive rows of a column */ +typedef OPJ_UINT16 opj_colflag_t; + /** Tier-1 coding (coding of code-block coefficients) */ @@ -107,11 +131,17 @@ typedef struct opj_t1 { OPJ_INT32 *data; opj_flag_t *flags; + /** Addition flag array such that colflags[1+0] is for state of col=0,row=0..3, + colflags[1+1] for col=1, row=0..3, colflags[1+flags_stride] for col=0,row=4..7, ... + This array avoids too much cache trashing when processing by 4 vertical samples + as done in the various decoding steps. */ + opj_colflag_t* colflags; OPJ_UINT32 w; OPJ_UINT32 h; OPJ_UINT32 datasize; OPJ_UINT32 flagssize; OPJ_UINT32 flags_stride; + OPJ_UINT32 colflags_size; OPJ_UINT32 data_stride; OPJ_BOOL encoder; } opj_t1_t; From 93f7f907117675cf6af227b4d0243a58f6c3640a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 10:25:55 +0200 Subject: [PATCH 08/22] opj_t1_decode_cblks(): tiny perf increase when loop unrolling --- src/lib/openjp2/t1.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 9ad6ffd0..89a7ff85 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1559,7 +1559,18 @@ OPJ_BOOL opj_t1_decode_cblks( opj_t1_t* t1, if (tccp->qmfbid == 1) { OPJ_INT32* restrict tiledp = &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x]; for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { + i = 0; + for (; i < (cblk_w & ~3); i += 4) { + OPJ_INT32 tmp0 = datap[(j * cblk_w) + i]; + OPJ_INT32 tmp1 = datap[(j * cblk_w) + i+1]; + OPJ_INT32 tmp2 = datap[(j * cblk_w) + i+2]; + OPJ_INT32 tmp3 = datap[(j * cblk_w) + i+3]; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp0/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+1] = tmp1/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+2] = tmp2/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+3] = 
tmp3/2; + } + for (; i < cblk_w; ++i) { OPJ_INT32 tmp = datap[(j * cblk_w) + i]; ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp/2; } From 956c31d5a6e4530a92b6dd6099bdbf071144f6f1 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 10:34:44 +0200 Subject: [PATCH 09/22] opj_t1_dec_clnpass(): remove useless test in the runlen decoding path (of the non VSC case) --- src/lib/openjp2/t1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 89a7ff85..a2c5dc6a 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1236,7 +1236,7 @@ static void opj_t1_enc_clnpass( runlen = (runlen << 1) | opj_mqc_decode(mqc); \ flags2 += (OPJ_UINT32)runlen * flags_stride; \ data2 += (OPJ_UINT32)runlen * w; \ - for (j = (OPJ_UINT32)runlen; j < 4 && j < h; ++j) { \ + for (j = (OPJ_UINT32)runlen; j < 4; ++j) { \ flags2 += flags_stride; \ if (j == (OPJ_UINT32)runlen) { \ opj_t1_dec_clnpass_step_partial(t1, flags2, colflags2, data2, oneplushalf, j); \ From 8371491a9968a31ce16d6ce37b775ef3c7d090c8 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 11:44:54 +0200 Subject: [PATCH 10/22] Better inlining of opj_t1_updateflagscolflags() w.r.t. 
flags_stride --- src/lib/openjp2/t1.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index a2c5dc6a..0023ad74 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -93,7 +93,8 @@ static INLINE void opj_t1_dec_sigpass_step_mqc( opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 row); + OPJ_INT32 row, + OPJ_INT32 flags_stride); static INLINE void opj_t1_dec_sigpass_step_mqc_vsc( opj_t1_t *t1, opj_flag_t *flagsp, @@ -474,7 +475,8 @@ static INLINE void opj_t1_dec_sigpass_step_mqc( opj_colflag_t* colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 row) + OPJ_INT32 row, + OPJ_INT32 flags_stride) { OPJ_INT32 v, flag; @@ -492,7 +494,7 @@ static INLINE void opj_t1_dec_sigpass_step_mqc( opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? -oneplushalf : oneplushalf; - opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, t1->flags_stride, row); + opj_t1_updateflagscolflags(flagsp, colflagsp, (OPJ_UINT32)v, flags_stride, row); } #ifdef CONSISTENCY_CHECK *flagsp |= T1_VISIT; @@ -609,16 +611,16 @@ static void opj_t1_dec_sigpass_raw( opj_colflag_t *colflags2 = colflags1 + i; \ if( *colflags2 == 0 ) continue; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 0); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 0, flags_stride); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 1); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 1, flags_stride); \ data2 += w; \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 2); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 2, flags_stride); \ data2 += w; \ flags2 += 
flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 3); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, 3, flags_stride); \ data2 += w; \ } \ data1 += w << 2; \ @@ -631,7 +633,7 @@ static void opj_t1_dec_sigpass_raw( opj_colflag_t *colflags2 = colflags1 + i; \ for (j = k; j < h; ++j) { \ flags2 += flags_stride; \ - opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, j - k); \ + opj_t1_dec_sigpass_step_mqc(t1, flags2, colflags2, data2, oneplushalf, j - k, flags_stride); \ data2 += w; \ } \ } \ @@ -1034,7 +1036,8 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( opj_colflag_t *colflagsp, OPJ_INT32 *datap, OPJ_INT32 oneplushalf, - OPJ_INT32 row) + OPJ_INT32 row, + OPJ_INT32 flags_stride) { OPJ_INT32 v; OPJ_INT32 flag; @@ -1049,7 +1052,7 @@ static void opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit( opj_mqc_setcurctx(mqc, opj_t1_getctxno_sc((OPJ_UINT32)flag)); v = opj_mqc_decode(mqc) ^ opj_t1_getspb((OPJ_UINT32)flag); *datap = v ? 
-oneplushalf : oneplushalf; - opj_t1_updateflagscolflags(flagsp, colflagsp, v, t1->flags_stride, row); + opj_t1_updateflagscolflags(flagsp, colflagsp, v, flags_stride, row); } } /*flagsp &= ~T1_VISIT;*/ @@ -1249,28 +1252,28 @@ static void opj_t1_enc_clnpass( flags2 += flags_stride; \ if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ if (!(colflags & (T1_COLFLAG_SIG_ROW_0 | T1_COLFLAG_VISIT_ROW_0))) {\ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 0); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 0, flags_stride); \ } \ if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ if (!(colflags & (T1_COLFLAG_SIG_ROW_1 | T1_COLFLAG_VISIT_ROW_1))) {\ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 1); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 1, flags_stride); \ } \ if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ if (!(colflags & (T1_COLFLAG_SIG_ROW_2 | T1_COLFLAG_VISIT_ROW_2))) {\ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 2); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 2, flags_stride); \ } \ if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ flags2 += flags_stride; \ if( consistency_check ) { assert( (!(colflags & (T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3))) == (!(*flags2 & (T1_SIG | T1_VISIT))) ); } \ if 
(!(colflags & (T1_COLFLAG_SIG_ROW_3 | T1_COLFLAG_VISIT_ROW_3))) {\ - opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 3); \ + opj_t1_dec_clnpass_step_only_if_flag_not_sig_visit(t1, flags2, colflags2, data2, oneplushalf, 3, flags_stride); \ } \ if( consistency_check ) *flags2 &= ~T1_VISIT; \ data2 += w; \ From 107eb31531ca688e2799406e69e9383efc13448f Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 13:45:15 +0200 Subject: [PATCH 11/22] Improve perf of opj_t1_dec_sigpass_mqc_vsc() and opj_t1_dec_refpass_mqc_vsc() with loop unrolling --- src/lib/openjp2/t1.c | 92 ++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 0023ad74..277261d7 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -660,27 +660,48 @@ static void opj_t1_dec_sigpass_mqc_vsc( { OPJ_INT32 one, half, oneplushalf, vsc; OPJ_UINT32 i, j, k; - opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; + OPJ_INT32 *data1 = t1->data; + opj_flag_t *flags1 = &t1->flags[1]; + opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; half = one >> 1; oneplushalf = one | half; - for (k = 0; k < t1->h; k += 4) { + for (k = 0; k < (t1->h & ~3); k += 4) { for (i = 0; i < t1->w; ++i) { + OPJ_INT32 *data2 = data1 + i; + opj_flag_t *flags2 = flags1 + i; opj_colflag_t *colflags2 = colflags1 + i; - for (j = k; j < k + 4 && j < t1->h; ++j) { - vsc = (j == k + 3 || j == t1->h - 1) ? 
1 : 0; - opj_t1_dec_sigpass_step_mqc_vsc( - t1, - &t1->flags[((j+1) * t1->flags_stride) + i + 1], - colflags2, - &t1->data[(j * t1->w) + i], - oneplushalf, - vsc, - j - k); - } + flags2 += t1->flags_stride; + opj_t1_dec_sigpass_step_mqc_vsc(t1, flags2, colflags2, data2, oneplushalf, 0, 0); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_sigpass_step_mqc_vsc(t1, flags2, colflags2, data2, oneplushalf, 0, 1); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_sigpass_step_mqc_vsc(t1, flags2, colflags2, data2, oneplushalf, 0, 2); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_sigpass_step_mqc_vsc(t1, flags2, colflags2, data2, oneplushalf, 1, 3); + data2 += t1->w; } + data1 += t1->w << 2; + flags1 += t1->flags_stride << 2; colflags1 += t1->flags_stride; } + for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; + for (j = k; j < t1->h; ++j) { + vsc = (j == t1->h - 1) ? 1 : 0; + opj_t1_dec_sigpass_step_mqc_vsc( + t1, + &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, + &t1->data[(j * t1->w) + i], + oneplushalf, + vsc, + j - k); + } + } } /* VSC and BYPASS by Antonin */ @@ -921,27 +942,48 @@ static void opj_t1_dec_refpass_mqc_vsc( OPJ_INT32 one, poshalf, neghalf; OPJ_UINT32 i, j, k; OPJ_INT32 vsc; + OPJ_INT32 *data1 = t1->data; + opj_flag_t *flags1 = &t1->flags[1]; opj_colflag_t *colflags1 = &t1->colflags[t1->flags_stride + 1]; one = 1 << bpno; poshalf = one >> 1; neghalf = bpno > 0 ? -poshalf : -1; - for (k = 0; k < t1->h; k += 4) { + for (k = 0; k < (t1->h & ~3); k += 4) { for (i = 0; i < t1->w; ++i) { + OPJ_INT32 *data2 = data1 + i; + opj_flag_t *flags2 = flags1 + i; opj_colflag_t *colflags2 = colflags1 + i; - for (j = k; j < k + 4 && j < t1->h; ++j) { - vsc = ((j == k + 3 || j == t1->h - 1)) ? 
1 : 0; - opj_t1_dec_refpass_step_mqc_vsc( - t1, - &t1->flags[((j+1) * t1->flags_stride) + i + 1], - colflags2, - &t1->data[(j * t1->w) + i], - poshalf, - neghalf, - vsc, j - k); - } + flags2 += t1->flags_stride; + opj_t1_dec_refpass_step_mqc_vsc(t1, flags2, colflags2, data2, poshalf, neghalf, 0, 0); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_refpass_step_mqc_vsc(t1, flags2, colflags2, data2, poshalf, neghalf, 0, 1); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_refpass_step_mqc_vsc(t1, flags2, colflags2, data2, poshalf, neghalf, 0, 2); + data2 += t1->w; + flags2 += t1->flags_stride; + opj_t1_dec_refpass_step_mqc_vsc(t1, flags2, colflags2, data2, poshalf, neghalf, 1, 3); + data2 += t1->w; } + data1 += t1->w << 2; + flags1 += t1->flags_stride << 2; colflags1 += t1->flags_stride; } + for (i = 0; i < t1->w; ++i) { + opj_colflag_t *colflags2 = colflags1 + i; + for (j = k; j < t1->h; ++j) { + vsc = (j == t1->h - 1) ? 1 : 0; + opj_t1_dec_refpass_step_mqc_vsc( + t1, + &t1->flags[((j+1) * t1->flags_stride) + i + 1], + colflags2, + &t1->data[(j * t1->w) + i], + poshalf, neghalf, + vsc, + j - k); + } + } } /* VSC and BYPASS by Antonin */ From 7092f7ea112fcc44e7426c462bf01a406b076620 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 23 May 2016 16:00:04 +0200 Subject: [PATCH 12/22] Fix MSVC210 build issue (use of C99 declaration after statement) introduced in ba1edf6cd41415594729bc90ad3b0008af48251e --- src/lib/openjp2/t1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index 277261d7..e1097bf5 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1650,13 +1650,13 @@ static OPJ_BOOL opj_t1_decode_cblk(opj_t1_t *t1, opj_raw_t *raw = t1->raw; /* RAW component */ opj_mqc_t *mqc = t1->mqc; /* MQC component */ - mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; - OPJ_INT32 bpno_plus_one; OPJ_UINT32 passtype; OPJ_UINT32 segno, passno; OPJ_BYTE type = 
T1_TYPE_MQ; /* BYPASS mode */ + mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; + if(!opj_t1_allocate_buffers( t1, (OPJ_UINT32)(cblk->x1 - cblk->x0), @@ -1903,7 +1903,6 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, OPJ_FLOAT64 cumwmsedec = 0.0; opj_mqc_t *mqc = t1->mqc; /* MQC component */ - mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; OPJ_UINT32 passno; OPJ_INT32 bpno; @@ -1914,6 +1913,8 @@ static void opj_t1_encode_cblk(opj_t1_t *t1, OPJ_BYTE type = T1_TYPE_MQ; OPJ_FLOAT64 tempwmsedec; + mqc->lut_ctxno_zc_orient = lut_ctxno_zc + orient * 256; + max = 0; for (i = 0; i < t1->w; ++i) { for (j = 0; j < t1->h; ++j) { From 54179fe1d53156c6b440166fe71a10f238a6ea56 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 16:34:52 +0200 Subject: [PATCH 13/22] Add threading and thread pool API --- src/lib/openjp2/CMakeLists.txt | 40 ++ src/lib/openjp2/openjpeg.h | 13 + src/lib/openjp2/opj_includes.h | 2 + src/lib/openjp2/thread.c | 959 +++++++++++++++++++++++++++++++++ src/lib/openjp2/thread.h | 253 +++++++++ 5 files changed, 1267 insertions(+) create mode 100644 src/lib/openjp2/thread.c create mode 100644 src/lib/openjp2/thread.h diff --git a/src/lib/openjp2/CMakeLists.txt b/src/lib/openjp2/CMakeLists.txt index c02a9948..f45ceb34 100644 --- a/src/lib/openjp2/CMakeLists.txt +++ b/src/lib/openjp2/CMakeLists.txt @@ -9,6 +9,8 @@ include_directories( ) # Defines the source code for the library set(OPENJPEG_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/thread.c + ${CMAKE_CURRENT_SOURCE_DIR}/thread.h ${CMAKE_CURRENT_SOURCE_DIR}/bio.c ${CMAKE_CURRENT_SOURCE_DIR}/bio.h ${CMAKE_CURRENT_SOURCE_DIR}/cio.c @@ -74,6 +76,11 @@ if(OPJ_DISABLE_TPSOT_FIX) add_definitions(-DOPJ_DISABLE_TPSOT_FIX) endif() +# Special case for old i586-mingw32msvc-gcc cross compiler +if(NOT WIN32 AND CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER MATCHES ".*mingw32msvc.*" ) + set(WIN32 YES) +endif() + # Build the library if(WIN32) if(BUILD_SHARED_LIBS) @@ -143,3 +150,36 @@ 
if(OPJ_USE_DSYMUTIL) DEPENDS ${OPENJPEG_LIBRARY_NAME}) endif() endif() + +################################################################################# +# threading configuration +################################################################################# +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + +option(USE_THREAD "Build with thread/mutex support " ON) +if(NOT USE_THREAD) + add_definitions( -DMUTEX_stub) +endif(NOT USE_THREAD) + +find_package(Threads QUIET) + +if(USE_THREAD AND WIN32 AND NOT Threads_FOUND ) + add_definitions( -DMUTEX_win32) + set(Threads_FOUND YES) +endif() + +if(USE_THREAD AND Threads_FOUND AND CMAKE_USE_WIN32_THREADS_INIT ) + add_definitions( -DMUTEX_win32) +endif(USE_THREAD AND Threads_FOUND AND CMAKE_USE_WIN32_THREADS_INIT ) + +if(USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT ) + add_definitions( -DMUTEX_pthread) +endif(USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT ) + +if(USE_THREAD AND NOT Threads_FOUND) + message(FATAL_ERROR "No thread library found and thread/mutex support is required by USE_THREAD option") +endif(USE_THREAD AND NOT Threads_FOUND) + +if(USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT}) +endif(USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) diff --git a/src/lib/openjp2/openjpeg.h b/src/lib/openjp2/openjpeg.h index c07e9c84..369693df 100644 --- a/src/lib/openjp2/openjpeg.h +++ b/src/lib/openjp2/openjpeg.h @@ -1554,6 +1554,19 @@ OPJ_API OPJ_BOOL OPJ_CALLCONV opj_set_MCT( opj_cparameters_t *parameters, OPJ_INT32 * p_dc_shift, OPJ_UINT32 pNbComp); +/* +========================================================== + Thread functions +========================================================== +*/ + +/** Returns if the library is built with thread support. + * OPJ_TRUE if mutex, condition, thread, thread pool are available. 
+ */ +OPJ_API OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void); + +/** Return the number of virtual CPUs */ +OPJ_API int OPJ_CALLCONV opj_get_num_cpus(void); #ifdef __cplusplus diff --git a/src/lib/openjp2/opj_includes.h b/src/lib/openjp2/opj_includes.h index 58a5a9a9..c2cc31fa 100644 --- a/src/lib/openjp2/opj_includes.h +++ b/src/lib/openjp2/opj_includes.h @@ -182,6 +182,8 @@ static INLINE long opj_lrintf(float f) { #include "bio.h" #include "cio.h" +#include "thread.h" + #include "image.h" #include "invert.h" #include "j2k.h" diff --git a/src/lib/openjp2/thread.c b/src/lib/openjp2/thread.c new file mode 100644 index 00000000..b2f8b5b2 --- /dev/null +++ b/src/lib/openjp2/thread.c @@ -0,0 +1,959 @@ +/* + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2016, Even Rouault + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opj_includes.h" + +#include "thread.h" +#include + +#ifdef MUTEX_win32 + +/* Some versions of x86_64-w64-mingw32-gc -m32 resolve InterlockedCompareExchange() */ +/* as __sync_val_compare_and_swap_4 but fails to link it. As this protects against */ +/* a rather unlikely race, skip it */ +#if !(defined(__MINGW32__) && defined(__i386__)) +#define HAVE_INTERLOCKED_COMPARE_EXCHANGE 1 +#endif + +#include + +OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) +{ + return OPJ_TRUE; +} + +int OPJ_CALLCONV opj_get_num_cpus(void) +{ + SYSTEM_INFO info; + DWORD dwNum; + GetSystemInfo(&info); + dwNum = info.dwNumberOfProcessors; + if( dwNum < 1 ) + return 1; + return (int)dwNum; +} + +struct opj_mutex_t +{ + CRITICAL_SECTION cs; +}; + +opj_mutex_t* opj_mutex_create(void) +{ + opj_mutex_t* mutex = (opj_mutex_t*) opj_malloc(sizeof(opj_mutex_t)); + if( !mutex ) + return NULL; + InitializeCriticalSectionAndSpinCount(&(mutex->cs), 4000); + return mutex; +} + +void opj_mutex_lock(opj_mutex_t* mutex) +{ + EnterCriticalSection( &(mutex->cs) ); +} + +void opj_mutex_unlock(opj_mutex_t* mutex) +{ + LeaveCriticalSection( &(mutex->cs) ); +} + +void opj_mutex_destroy(opj_mutex_t* mutex) +{ + if( !mutex ) return; + DeleteCriticalSection( &(mutex->cs) ); + opj_free( mutex ); +} + +struct opj_cond_waiter_list_t +{ + HANDLE hEvent; + struct opj_cond_waiter_list_t* next; +}; +typedef struct opj_cond_waiter_list_t opj_cond_waiter_list_t; + +struct 
opj_cond_t +{ + opj_mutex_t *internal_mutex; + opj_cond_waiter_list_t *waiter_list; +}; + +static DWORD TLSKey = 0; +static volatile LONG inTLSLockedSection = 0; +static volatile int TLSKeyInit = OPJ_FALSE; + +opj_cond_t* opj_cond_create(void) +{ + opj_cond_t* cond = (opj_cond_t*) opj_malloc(sizeof(opj_cond_t)); + if( !cond ) + return NULL; + + /* Make sure that the TLS key is allocated in a thread-safe way */ + /* We cannot use a global mutex/critical section since its creation itself would not be */ + /* thread-safe, so use InterlockedCompareExchange trick */ + while( OPJ_TRUE ) + { + +#if HAVE_INTERLOCKED_COMPARE_EXCHANGE + if( InterlockedCompareExchange(&inTLSLockedSection, 1, 0) == 0 ) +#endif + { + if( !TLSKeyInit ) + { + TLSKey = TlsAlloc(); + TLSKeyInit = OPJ_TRUE; + } +#if HAVE_INTERLOCKED_COMPARE_EXCHANGE + InterlockedCompareExchange(&inTLSLockedSection, 0, 1); +#endif + break; + } + } + + if( TLSKey == TLS_OUT_OF_INDEXES ) + { + opj_free(cond); + return NULL; + } + cond->internal_mutex = opj_mutex_create(); + if (cond->internal_mutex == NULL) + { + opj_free(cond); + return NULL; + } + cond->waiter_list = NULL; + return cond; +} + +void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) +{ + opj_cond_waiter_list_t* item; + HANDLE hEvent = (HANDLE) TlsGetValue( TLSKey ); + if (hEvent == NULL) + { + hEvent = CreateEvent(NULL, /* security attributes */ + 0, /* manual reset = no */ + 0, /* initial state = unsignaled */ + NULL /* no name */); + assert(hEvent); + + TlsSetValue( TLSKey, hEvent ); + } + + /* Insert the waiter into the waiter list of the condition */ + opj_mutex_lock(cond->internal_mutex); + + item = (opj_cond_waiter_list_t*)opj_malloc(sizeof(opj_cond_waiter_list_t)); + assert(item != NULL); + + item->hEvent = hEvent; + item->next = cond->waiter_list; + + cond->waiter_list = item; + + opj_mutex_unlock(cond->internal_mutex); + + /* Release the client mutex before waiting for the event being signaled */ + opj_mutex_unlock(mutex); + + /* Ideally we 
would check that we do not get WAIT_FAILED but it is hard */ + /* to report a failure. */ + WaitForSingleObject(hEvent, INFINITE); + + /* Reacquire the client mutex */ + opj_mutex_lock(mutex); +} + +void opj_cond_signal(opj_cond_t* cond) +{ + opj_cond_waiter_list_t* psIter; + + /* Signal the first registered event, and remove it from the list */ + opj_mutex_lock(cond->internal_mutex); + + psIter = cond->waiter_list; + if (psIter != NULL) + { + SetEvent(psIter->hEvent); + cond->waiter_list = psIter->next; + opj_free(psIter); + } + + opj_mutex_unlock(cond->internal_mutex); +} + +void opj_cond_destroy(opj_cond_t* cond) +{ + if( !cond ) return; + opj_mutex_destroy(cond->internal_mutex); + assert(cond->waiter_list == NULL); + opj_free(cond); +} + +struct opj_thread_t +{ + opj_thread_fn thread_fn; + void* user_data; + HANDLE hThread; +}; + +static DWORD WINAPI opj_thread_callback_adapter( void *info ) +{ + opj_thread_t* thread = (opj_thread_t*) info; + HANDLE hEvent = NULL; + + thread->thread_fn( thread->user_data ); + + /* Free the handle possible allocated by a cond */ + while( OPJ_TRUE ) + { + /* Make sure TLSKey is not being created just at that moment... 
*/ +#if HAVE_INTERLOCKED_COMPARE_EXCHANGE + if( InterlockedCompareExchange(&inTLSLockedSection, 1, 0) == 0 ) +#endif + { + if( TLSKeyInit ) + { + hEvent = (HANDLE) TlsGetValue( TLSKey ); + } +#if HAVE_INTERLOCKED_COMPARE_EXCHANGE + InterlockedCompareExchange(&inTLSLockedSection, 0, 1); +#endif + break; + } + } + if( hEvent ) + CloseHandle(hEvent); + + return 0; +} + +opj_thread_t* opj_thread_create( opj_thread_fn thread_fn, void* user_data ) +{ + opj_thread_t* thread; + DWORD nThreadId = 0; + + assert( thread_fn ); + + thread = (opj_thread_t*) opj_malloc( sizeof(opj_thread_t) ); + if( !thread ) + return NULL; + thread->thread_fn = thread_fn; + thread->user_data = user_data; + + thread->hThread = CreateThread( NULL, 0, opj_thread_callback_adapter, thread, + 0, &nThreadId ); + + if( thread->hThread == NULL ) + { + opj_free( thread ); + return NULL; + } + return thread; +} + +void opj_thread_join( opj_thread_t* thread ) +{ + WaitForSingleObject(thread->hThread, INFINITE); + CloseHandle( thread->hThread ); + + opj_free(thread); +} + +#elif MUTEX_pthread + +#include +#include +#include + +OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) +{ + return OPJ_TRUE; +} + +int OPJ_CALLCONV opj_get_num_cpus(void) +{ +#ifdef _SC_NPROCESSORS_ONLN + return (int)sysconf(_SC_NPROCESSORS_ONLN); +#else + return 1; +#endif +} + +struct opj_mutex_t +{ + pthread_mutex_t mutex; +}; + +opj_mutex_t* opj_mutex_create(void) +{ + opj_mutex_t* mutex = (opj_mutex_t*) opj_malloc(sizeof(opj_mutex_t)); + if( !mutex ) + return NULL; + pthread_mutex_t pthr_mutex = PTHREAD_MUTEX_INITIALIZER; + mutex->mutex = pthr_mutex; + return mutex; +} + +void opj_mutex_lock(opj_mutex_t* mutex) +{ + pthread_mutex_lock(&(mutex->mutex)); +} + +void opj_mutex_unlock(opj_mutex_t* mutex) +{ + pthread_mutex_unlock(&(mutex->mutex)); +} + +void opj_mutex_destroy(opj_mutex_t* mutex) +{ + if( !mutex ) return; + pthread_mutex_destroy(&(mutex->mutex)); + opj_free(mutex); +} + +struct opj_cond_t +{ + pthread_cond_t cond; +}; + 
+opj_cond_t* opj_cond_create(void) +{ + opj_cond_t* cond = (opj_cond_t*) opj_malloc(sizeof(opj_cond_t)); + if( !cond ) + return NULL; + if( pthread_cond_init(&(cond->cond), NULL) != 0 ) + { + opj_free(cond); + return NULL; + } + return cond; +} + +void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) +{ + pthread_cond_wait(&(cond->cond), &(mutex->mutex)); +} + +void opj_cond_signal(opj_cond_t* cond) +{ + int ret = pthread_cond_signal(&(cond->cond)); + (void)ret; + assert(ret == 0); +} + +void opj_cond_destroy(opj_cond_t* cond) +{ + if( !cond ) return; + pthread_cond_destroy(&(cond->cond)); + opj_free(cond); +} + + +struct opj_thread_t +{ + opj_thread_fn thread_fn; + void* user_data; + pthread_t thread; +}; + +static void* opj_thread_callback_adapter( void* info ) +{ + opj_thread_t* thread = (opj_thread_t*) info; + thread->thread_fn( thread->user_data ); + return NULL; +} + +opj_thread_t* opj_thread_create( opj_thread_fn thread_fn, void* user_data ) +{ + pthread_attr_t attr; + opj_thread_t* thread; + + assert( thread_fn ); + + thread = (opj_thread_t*) opj_malloc( sizeof(opj_thread_t) ); + if( !thread ) + return NULL; + thread->thread_fn = thread_fn; + thread->user_data = user_data; + + pthread_attr_init( &attr ); + pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_JOINABLE ); + if( pthread_create( &(thread->thread), &attr, + opj_thread_callback_adapter, (void *) thread ) != 0 ) + { + opj_free( thread ); + return NULL; + } + return thread; +} + +void opj_thread_join( opj_thread_t* thread ) +{ + void* status; + pthread_join( thread->thread, &status); + + opj_free(thread); +} + +#else +/* Stub implementation */ + +OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) +{ + return OPJ_FALSE; +} + +int OPJ_CALLCONV opj_get_num_cpus(void) +{ + return 1; +} + +opj_mutex_t* opj_mutex_create(void) +{ + return NULL; +} + +void opj_mutex_lock(opj_mutex_t* mutex) +{ + (void) mutex; +} + +void opj_mutex_unlock(opj_mutex_t* mutex) +{ + (void) mutex; +} + +void 
opj_mutex_destroy(opj_mutex_t* mutex) +{ + (void) mutex; +} + +opj_cond_t* opj_cond_create(void) +{ + return NULL; +} + +void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex) +{ + (void) cond; + (void) mutex; +} + +void opj_cond_signal(opj_cond_t* cond) +{ + (void) cond; +} + +void opj_cond_destroy(opj_cond_t* cond) +{ + (void) cond; +} + +opj_thread_t* opj_thread_create( opj_thread_fn thread_fn, void* user_data ) +{ + (void) thread_fn; + (void) user_data; + return NULL; +} + +void opj_thread_join( opj_thread_t* thread ) +{ + (void) thread; +} + +#endif + +typedef struct +{ + int key; + void* value; + opj_tls_free_func opj_free_func; +} opj_tls_key_val_t; + +struct opj_tls_t +{ + opj_tls_key_val_t* key_val; + int key_val_count; +}; + +static opj_tls_t* opj_tls_new(void) +{ + return (opj_tls_t*) opj_calloc(1, sizeof(opj_tls_t)); +} + +static void opj_tls_destroy(opj_tls_t* tls) +{ + int i; + if( !tls ) return; + for(i=0;ikey_val_count;i++) + { + if( tls->key_val[i].opj_free_func ) + tls->key_val[i].opj_free_func(tls->key_val[i].value); + } + opj_free(tls->key_val); + opj_free(tls); +} + +void* opj_tls_get(opj_tls_t* tls, int key) +{ + int i; + for(i=0;ikey_val_count;i++) + { + if( tls->key_val[i].key == key ) + return tls->key_val[i].value; + } + return NULL; +} + +OPJ_BOOL opj_tls_set(opj_tls_t* tls, int key, void* value, opj_tls_free_func opj_free_func) +{ + opj_tls_key_val_t* new_key_val; + int i; + for(i=0;ikey_val_count;i++) + { + if( tls->key_val[i].key == key ) + { + if( tls->key_val[i].opj_free_func ) + tls->key_val[i].opj_free_func(tls->key_val[i].value); + tls->key_val[i].value = value; + tls->key_val[i].opj_free_func = opj_free_func; + return OPJ_TRUE; + } + } + new_key_val = (opj_tls_key_val_t*) opj_realloc( tls->key_val, + (tls->key_val_count + 1) * sizeof(opj_tls_key_val_t) ); + if( !new_key_val ) + return OPJ_FALSE; + tls->key_val = new_key_val; + new_key_val[tls->key_val_count].key = key; + new_key_val[tls->key_val_count].value = value; + 
new_key_val[tls->key_val_count].opj_free_func = opj_free_func; + tls->key_val_count ++; + return OPJ_TRUE; +} + + +typedef struct +{ + opj_job_fn job_fn; + void *user_data; +} opj_worker_thread_job_t; + +typedef struct +{ + opj_thread_pool_t *tp; + opj_thread_t *thread; + int marked_as_waiting; + + opj_mutex_t *mutex; + opj_cond_t *cond; +} opj_worker_thread_t; + +typedef enum +{ + OPJWTS_OK, + OPJWTS_STOP, + OPJWTS_ERROR +} opj_worker_thread_state; + +struct opj_job_list_t +{ + opj_worker_thread_job_t* job; + struct opj_job_list_t* next; +}; +typedef struct opj_job_list_t opj_job_list_t; + +struct opj_worker_thread_list_t +{ + opj_worker_thread_t* worker_thread; + struct opj_worker_thread_list_t* next; +}; +typedef struct opj_worker_thread_list_t opj_worker_thread_list_t; + +struct opj_thread_pool_t +{ + opj_worker_thread_t* worker_threads; + int worker_threads_count; + opj_cond_t* cond; + opj_mutex_t* mutex; + volatile opj_worker_thread_state state; + opj_job_list_t* job_queue; + volatile int pending_jobs_count; + opj_worker_thread_list_t* waiting_worker_thread_list; + int waiting_worker_thread_count; + opj_tls_t* tls; + int signaling_threshold; +}; + +static OPJ_BOOL opj_thread_pool_setup(opj_thread_pool_t* tp, int num_threads); +static opj_worker_thread_job_t* opj_thread_pool_get_next_job(opj_thread_pool_t* tp, + opj_worker_thread_t* worker_thread, + OPJ_BOOL signal_job_finished); + +opj_thread_pool_t* opj_thread_pool_create(int num_threads) +{ + opj_thread_pool_t* tp; + + tp = (opj_thread_pool_t*) opj_calloc(1, sizeof(opj_thread_pool_t)); + if( !tp ) + return NULL; + tp->state = OPJWTS_OK; + + if( num_threads <= 0 ) + { + tp->tls = opj_tls_new(); + if( !tp->tls ) + { + opj_free(tp); + tp = NULL; + } + return tp; + } + + tp->mutex = opj_mutex_create(); + if( !tp->mutex ) + { + opj_free(tp); + return NULL; + } + if( !opj_thread_pool_setup(tp, num_threads) ) + { + opj_thread_pool_destroy(tp); + return NULL; + } + return tp; +} + +static void 
opj_worker_thread_function(void* user_data) +{ + opj_worker_thread_t* worker_thread; + opj_thread_pool_t* tp; + opj_tls_t* tls; + OPJ_BOOL job_finished = OPJ_FALSE; + + worker_thread = (opj_worker_thread_t* ) user_data; + tp = worker_thread->tp; + tls = opj_tls_new(); + + while( OPJ_TRUE ) + { + opj_worker_thread_job_t* job = opj_thread_pool_get_next_job(tp, worker_thread, job_finished); + if( job == NULL ) + break; + + if( job->job_fn ) + { + job->job_fn(job->user_data, tls); + } + opj_free(job); + job_finished = OPJ_TRUE; + } + + opj_tls_destroy(tls); +} + +static OPJ_BOOL opj_thread_pool_setup(opj_thread_pool_t* tp, int num_threads) +{ + int i; + OPJ_BOOL bRet = OPJ_TRUE; + + assert( num_threads > 0 ); + + tp->cond = opj_cond_create(); + if( tp->cond == NULL ) + return OPJ_FALSE; + + tp->worker_threads = (opj_worker_thread_t*) opj_calloc( num_threads, + sizeof(opj_worker_thread_t) ); + if( tp->worker_threads == NULL ) + return OPJ_FALSE; + tp->worker_threads_count = num_threads; + + for(i=0;iworker_threads[i].tp = tp; + + tp->worker_threads[i].mutex = opj_mutex_create(); + if( tp->worker_threads[i].mutex == NULL ) + { + tp->worker_threads_count = i; + bRet = OPJ_FALSE; + break; + } + + tp->worker_threads[i].cond = opj_cond_create(); + if( tp->worker_threads[i].cond == NULL ) + { + opj_mutex_destroy(tp->worker_threads[i].mutex); + tp->worker_threads_count = i; + bRet = OPJ_FALSE; + break; + } + + tp->worker_threads[i].marked_as_waiting = OPJ_FALSE; + + tp->worker_threads[i].thread = opj_thread_create(opj_worker_thread_function, + &(tp->worker_threads[i])); + if( tp->worker_threads[i].thread == NULL ) + { + tp->worker_threads_count = i; + bRet = OPJ_FALSE; + break; + } + } + + /* Wait all threads to be started */ + /* printf("waiting for all threads to be started\n"); */ + opj_mutex_lock(tp->mutex); + while( tp->waiting_worker_thread_count < num_threads ) + { + opj_cond_wait(tp->cond, tp->mutex); + } + opj_mutex_unlock(tp->mutex); + /* printf("all threads 
started\n"); */ + + if( tp->state == OPJWTS_ERROR ) + bRet = OPJ_FALSE; + + return bRet; +} + +/* +void opj_waiting() +{ + printf("waiting!\n"); +} +*/ + +static opj_worker_thread_job_t* opj_thread_pool_get_next_job(opj_thread_pool_t* tp, + opj_worker_thread_t* worker_thread, + OPJ_BOOL signal_job_finished) +{ + while( OPJ_TRUE ) + { + opj_job_list_t* top_job_iter; + + opj_mutex_lock(tp->mutex); + + if( signal_job_finished ) + { + signal_job_finished = OPJ_FALSE; + tp->pending_jobs_count --; + /*printf("tp=%p, remaining jobs: %d\n", tp, tp->pending_jobs_count);*/ + if( tp->pending_jobs_count <= tp->signaling_threshold ) + opj_cond_signal(tp->cond); + } + + if( tp->state == OPJWTS_STOP ) + { + opj_mutex_unlock(tp->mutex); + return NULL; + } + top_job_iter = tp->job_queue; + if( top_job_iter ) + { + opj_worker_thread_job_t* job; + tp->job_queue = top_job_iter->next; + + job = top_job_iter->job; + opj_mutex_unlock(tp->mutex); + opj_free(top_job_iter); + return job; + } + + /* opj_waiting(); */ + if( !worker_thread->marked_as_waiting ) + { + opj_worker_thread_list_t* item; + + worker_thread->marked_as_waiting = OPJ_TRUE; + tp->waiting_worker_thread_count ++; + assert(tp->waiting_worker_thread_count <= tp->worker_threads_count); + + item= (opj_worker_thread_list_t*) opj_malloc(sizeof(opj_worker_thread_list_t)); + if( item == NULL ) + { + tp->state = OPJWTS_ERROR; + opj_cond_signal(tp->cond); + + opj_mutex_unlock(tp->mutex); + return NULL; + } + + item->worker_thread = worker_thread; + item->next = tp->waiting_worker_thread_list; + tp->waiting_worker_thread_list = item; + } + + /* printf("signaling that worker thread is ready\n"); */ + opj_cond_signal(tp->cond); + + opj_mutex_lock(worker_thread->mutex); + opj_mutex_unlock(tp->mutex); + + /* printf("waiting for job\n"); */ + opj_cond_wait( worker_thread->cond, worker_thread->mutex ); + + opj_mutex_unlock(worker_thread->mutex); + /* printf("got job\n"); */ + } +} + +OPJ_BOOL opj_thread_pool_submit_job(opj_thread_pool_t* 
tp, + opj_job_fn job_fn, + void* user_data) +{ + opj_worker_thread_job_t* job; + opj_job_list_t* item; + + if( tp->mutex == NULL ) + { + job_fn( user_data, tp->tls ); + return OPJ_TRUE; + } + + job = (opj_worker_thread_job_t*)opj_malloc(sizeof(opj_worker_thread_job_t)); + if( job == NULL ) + return OPJ_FALSE; + job->job_fn = job_fn; + job->user_data = user_data; + + item = (opj_job_list_t*) opj_malloc(sizeof(opj_job_list_t)); + if( item == NULL ) + { + opj_free(job); + return OPJ_FALSE; + } + item->job = job; + + opj_mutex_lock(tp->mutex); + + tp->signaling_threshold = 100 * tp->worker_threads_count; + while( tp->pending_jobs_count > tp->signaling_threshold ) + { + /* printf("%d jobs enqueued. Waiting\n", tp->pending_jobs_count); */ + opj_cond_wait(tp->cond, tp->mutex); + /* printf("...%d jobs enqueued.\n", tp->pending_jobs_count); */ + } + + item->next = tp->job_queue; + tp->job_queue = item; + tp->pending_jobs_count ++; + + if( tp->waiting_worker_thread_list ) + { + opj_worker_thread_t* worker_thread; + opj_worker_thread_list_t* next; + opj_worker_thread_list_t* to_opj_free; + + worker_thread = tp->waiting_worker_thread_list->worker_thread; + + assert( worker_thread->marked_as_waiting ); + worker_thread->marked_as_waiting = OPJ_FALSE; + + next = tp->waiting_worker_thread_list->next; + to_opj_free = tp->waiting_worker_thread_list; + tp->waiting_worker_thread_list = next; + tp->waiting_worker_thread_count --; + + opj_mutex_lock(worker_thread->mutex); + opj_mutex_unlock(tp->mutex); + opj_cond_signal(worker_thread->cond); + opj_mutex_unlock(worker_thread->mutex); + + opj_free(to_opj_free); + } + else + opj_mutex_unlock(tp->mutex); + + return OPJ_TRUE; +} + +void opj_thread_pool_wait_completion(opj_thread_pool_t* tp, int max_remaining_jobs) +{ + if( tp->mutex == NULL ) + { + return; + } + + if( max_remaining_jobs < 0 ) + max_remaining_jobs = 0; + opj_mutex_lock(tp->mutex); + tp->signaling_threshold = max_remaining_jobs; + while( tp->pending_jobs_count > 
max_remaining_jobs ) + { + /*printf("tp=%p, jobs before wait = %d, max_remaining_jobs = %d\n", tp, tp->pending_jobs_count, max_remaining_jobs);*/ + opj_cond_wait(tp->cond, tp->mutex); + /*printf("tp=%p, jobs after wait = %d\n", tp, tp->pending_jobs_count);*/ + } + opj_mutex_unlock(tp->mutex); +} + +int opj_thread_pool_get_thread_count(opj_thread_pool_t* tp) +{ + return tp->worker_threads_count; +} + +void opj_thread_pool_destroy(opj_thread_pool_t* tp) +{ + if( !tp ) return; + if( tp->cond ) + { + int i; + opj_thread_pool_wait_completion(tp, 0); + + tp->state = OPJWTS_STOP; + + for(i=0;i<tp->worker_threads_count;i++) + { + opj_mutex_lock(tp->worker_threads[i].mutex); + opj_cond_signal(tp->worker_threads[i].cond); + opj_mutex_unlock(tp->worker_threads[i].mutex); + opj_thread_join(tp->worker_threads[i].thread); + opj_cond_destroy(tp->worker_threads[i].cond); + opj_mutex_destroy(tp->worker_threads[i].mutex); + } + + opj_free(tp->worker_threads); + + while( tp->waiting_worker_thread_list != NULL ) + { + opj_worker_thread_list_t* next = tp->waiting_worker_thread_list->next; + opj_free( tp->waiting_worker_thread_list ); + tp->waiting_worker_thread_list = next; + } + + opj_cond_destroy(tp->cond); + } + opj_mutex_destroy(tp->mutex); + opj_tls_destroy(tp->tls); + opj_free(tp); +} diff --git a/src/lib/openjp2/thread.h b/src/lib/openjp2/thread.h new file mode 100644 index 00000000..241e6d88 --- /dev/null +++ b/src/lib/openjp2/thread.h @@ -0,0 +1,253 @@ +/* + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2016, Even Rouault + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef THREAD_H +#define THREAD_H + +#include "openjpeg.h" + +/** +@file thread.h +@brief Thread API + +The functions in thread.c manage mutexes, conditions, thread +creation and thread pools that accept jobs. +*/ + +/** @defgroup THREAD THREAD - Mutex, conditions, threads and thread pools */ +/*@{*/ + +/** @name Mutex */ +/*@{*/ + +/** Opaque type for a mutex */ +typedef struct opj_mutex_t opj_mutex_t; + +/** Creates a mutex. + * @return the mutex or NULL in case of error (can for example happen if the library + * is built without thread support) + */ +opj_mutex_t* opj_mutex_create(void); + +/** Lock/acquire the mutex. + * @param mutex the mutex to acquire. + */ +void opj_mutex_lock(opj_mutex_t* mutex); + +/** Unlock/release the mutex. + * @param mutex the mutex to release.
+ */ +void opj_mutex_unlock(opj_mutex_t* mutex); + +/** Destroy a mutex + * @param mutex the mutex to destroy. + */ +void opj_mutex_destroy(opj_mutex_t* mutex); + +/*@}*/ + +/** @name Condition */ +/*@{*/ + +/** Opaque type for a condition */ +typedef struct opj_cond_t opj_cond_t; + +/** Creates a condition. + * @return the condition or NULL in case of error (can for example happen if the library + * is built without thread support) + */ +opj_cond_t* opj_cond_create(void); + +/** Wait for the condition to be signaled. + * The semantics are the same as the POSIX pthread_cond_wait. + * The provided mutex *must* be acquired before calling this function, and + * released afterwards. + * The mutex will be released by this function while it must wait for the condition + * and reacquired afterwards. + * In some particular situations, the function might return even if the condition is not signaled + * with opj_cond_signal(), hence the need to check with an application level + * mechanism. + * + * Waiting thread : + * \code + * opj_mutex_lock(mutex); + * while( !some_application_level_condition ) + * { + * opj_cond_wait(cond, mutex); + * } + * opj_mutex_unlock(mutex); + * \endcode + * + * Signaling thread : + * \code + * opj_mutex_lock(mutex); + * some_application_level_condition = TRUE; + * opj_cond_signal(cond); + * opj_mutex_unlock(mutex); + * \endcode + * + * @param cond the condition to wait on. + * @param mutex the mutex (in acquired state before calling this function) + */ +void opj_cond_wait(opj_cond_t* cond, opj_mutex_t* mutex); + +/** Signal waiting threads on a condition. + * One of the threads waiting with opj_cond_wait() will be woken up. + * It is strongly advised that this call is done with the mutex that is used + * by opj_cond_wait(), in an acquired state. + * @param cond the condition to signal. + */ +void opj_cond_signal(opj_cond_t* cond); + +/** Destroy a condition + * @param cond the condition to destroy.
+ */ +void opj_cond_destroy(opj_cond_t* cond); + +/*@}*/ + +/** @name Thread */ +/*@{*/ + +/** Opaque type for a thread handle */ +typedef struct opj_thread_t opj_thread_t; + +/** User function to execute in a thread + * @param user_data user data provided with opj_thread_create() + */ +typedef void (*opj_thread_fn)(void* user_data); + +/** Creates a new thread. + * @param thread_fn Function to run in the new thread. + * @param user_data user data provided to the thread function. Might be NULL. + * @return a thread handle or NULL in case of failure (can for example happen if the library + * is built without thread support) + */ +opj_thread_t* opj_thread_create( opj_thread_fn thread_fn, void* user_data ); + +/** Wait for a thread to finish and release the resources associated with the + * thread handle. + * @param thread the thread to wait for. + */ +void opj_thread_join( opj_thread_t* thread ); + +/*@}*/ + +/** @name Thread local storage */ +/*@{*/ +/** Opaque type for a thread local storage */ +typedef struct opj_tls_t opj_tls_t; + +/** Get a thread local value corresponding to the provided key. + * @param tls thread local storage handle + * @param key key whose value to retrieve. + * @return value associated with the key, or NULL if missing. + */ +void* opj_tls_get(opj_tls_t* tls, int key); + +/** Type of the function used to free a TLS value */ +typedef void (*opj_tls_free_func)(void* value); + +/** Set a thread local value corresponding to the provided key. + * @param tls thread local storage handle + * @param key key whose value to set. + * @param value value to set (may be NULL). + * @param free_func function to call to free the installed value. + * @return OPJ_TRUE if successful. + */ +OPJ_BOOL opj_tls_set(opj_tls_t* tls, int key, void* value, opj_tls_free_func free_func); + +/*@}*/ + +/** @name Thread pool */ +/*@{*/ + +/** Opaque type for a thread pool */ +typedef struct opj_thread_pool_t opj_thread_pool_t; + +/** Create a new thread pool.
+ * num_threads must nominally be >= 1 to create a real thread pool. If num_threads + * is negative or zero, then a dummy thread pool will be created. All functions + * operating on the thread pool will work, but submitted jobs will be run + * synchronously in the calling thread. + * + * @param num_threads the number of threads to allocate for this thread pool. + * @return a thread pool handle, or NULL in case of failure (can for example happen if the library + * is built without thread support) + */ +opj_thread_pool_t* opj_thread_pool_create(int num_threads); + +/** User function to execute in a thread + * @param user_data user data provided with opj_thread_pool_submit_job() + * @param tls handle to thread local storage + */ +typedef void (*opj_job_fn)(void* user_data, opj_tls_t* tls); + + +/** Submit a new job to be run by one of the threads in the thread pool. + * The job ( job_fn, user_data ) will be added to the queue of jobs managed + * by the thread pool, and run by the first thread that is no longer busy. + * + * @param tp the thread pool handle. + * @param job_fn Function to run. Must not be NULL. + * @param user_data User data provided to job_fn. + * @return OPJ_TRUE if the job was successfully submitted. + */ +OPJ_BOOL opj_thread_pool_submit_job(opj_thread_pool_t* tp, opj_job_fn job_fn, void* user_data); + +/** Wait until no more than max_remaining_jobs jobs remain in the queue of + * the thread pool. The aim of this function is to avoid submitting too many + * jobs while the thread pool cannot cope fast enough with them, which would + * potentially result in out-of-memory situations with too many job descriptions + * being queued. + * + * @param tp the thread pool handle + * @param max_remaining_jobs maximum number of jobs allowed to be queued without waiting. + */ +void opj_thread_pool_wait_completion(opj_thread_pool_t* tp, int max_remaining_jobs); + +/** Return the number of threads associated with the thread pool.
+ * + * @param tp the thread pool handle. + * @return number of threads associated with the thread pool. + */ +int opj_thread_pool_get_thread_count(opj_thread_pool_t* tp); + +/** Destroy a thread pool. + * @param tp the thread pool handle. + */ +void opj_thread_pool_destroy(opj_thread_pool_t* tp); + +/*@}*/ + +/*@}*/ + +#endif /* THREAD_H */ From d4b7f03cfa4732132767188782683f3d957da912 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 16:36:47 +0200 Subject: [PATCH 14/22] Add opj_codec_set_threads() in public API and propagate resulting thread pool to tcd level By default, only the main thread is used. If opj_codec_set_threads() is not used, but the OPJ_NUM_THREADS environment variable is set, its value will be used to initialize the number of threads. The value can be either an integer number, or "ALL_CPUS". If OPJ_NUM_THREADS is set and this function is called, this function will override the behaviour of the environment variable. --- src/lib/openjp2/j2k.c | 55 +++++++++++++++++++++++++++++++++++-- src/lib/openjp2/j2k.h | 8 ++++++ src/lib/openjp2/jp2.c | 5 ++++ src/lib/openjp2/jp2.h | 2 ++ src/lib/openjp2/openjpeg.c | 18 ++++++++++++ src/lib/openjp2/openjpeg.h | 19 +++++++++++++ src/lib/openjp2/opj_codec.h | 4 +++ src/lib/openjp2/tcd.c | 4 ++- src/lib/openjp2/tcd.h | 6 +++- 9 files changed, 117 insertions(+), 4 deletions(-) diff --git a/src/lib/openjp2/j2k.c b/src/lib/openjp2/j2k.c index 9eaa155e..68b2f82e 100644 --- a/src/lib/openjp2/j2k.c +++ b/src/lib/openjp2/j2k.c @@ -5944,6 +5944,32 @@ void opj_j2k_setup_decoder(opj_j2k_t *j2k, opj_dparameters_t *parameters) } } +OPJ_BOOL opj_j2k_set_threads(opj_j2k_t *j2k, OPJ_UINT32 num_threads) +{ + if( opj_has_thread_support() ) + { + opj_thread_pool_destroy(j2k->m_tp); + j2k->m_tp = opj_thread_pool_create((int)num_threads); + if( j2k->m_tp == 0 ) + { + j2k->m_tp = opj_thread_pool_create(0); + return OPJ_FALSE; + } + return OPJ_TRUE; + } + return OPJ_FALSE; +} + +static int 
opj_j2k_get_default_thread_count() +{ + const char* num_threads = getenv("OPJ_NUM_THREADS"); + if( num_threads == NULL || !opj_has_thread_support() ) + return 0; + if( strcmp(num_threads, "ALL_CPUS") == 0 ) + return opj_get_num_cpus(); + return atoi(num_threads); +} + /* ----------------------------------------------------------------------- */ /* J2K encoder interface */ /* ----------------------------------------------------------------------- */ @@ -5981,6 +6007,17 @@ opj_j2k_t* opj_j2k_create_compress(void) return NULL; } + l_j2k->m_tp = opj_thread_pool_create(opj_j2k_get_default_thread_count()); + if( !l_j2k->m_tp ) + { + l_j2k->m_tp = opj_thread_pool_create(0); + } + if( !l_j2k->m_tp ) + { + opj_j2k_destroy(l_j2k); + return NULL; + } + return l_j2k; } @@ -7486,7 +7523,7 @@ static OPJ_BOOL opj_j2k_copy_default_tcp_and_create_tcd ( opj_j2k_t * p_j2 return OPJ_FALSE; } - if ( !opj_tcd_init(p_j2k->m_tcd, l_image, &(p_j2k->m_cp)) ) { + if ( !opj_tcd_init(p_j2k->m_tcd, l_image, &(p_j2k->m_cp), p_j2k->m_tp) ) { opj_tcd_destroy(p_j2k->m_tcd); p_j2k->m_tcd = 00; opj_event_msg(p_manager, EVT_ERROR, "Cannot decode tile, memory error\n"); @@ -7567,6 +7604,9 @@ void opj_j2k_destroy (opj_j2k_t *p_j2k) opj_image_destroy(p_j2k->m_output_image); p_j2k->m_output_image = NULL; + opj_thread_pool_destroy(p_j2k->m_tp); + p_j2k->m_tp = NULL; + opj_free(p_j2k); } @@ -8658,6 +8698,17 @@ opj_j2k_t* opj_j2k_create_decompress(void) return 00; } + l_j2k->m_tp = opj_thread_pool_create(opj_j2k_get_default_thread_count()); + if( !l_j2k->m_tp ) + { + l_j2k->m_tp = opj_thread_pool_create(0); + } + if( !l_j2k->m_tp ) + { + opj_j2k_destroy(l_j2k); + return NULL; + } + return l_j2k; } @@ -10934,7 +10985,7 @@ static OPJ_BOOL opj_j2k_create_tcd( opj_j2k_t *p_j2k, return OPJ_FALSE; } - if (!opj_tcd_init(p_j2k->m_tcd,p_j2k->m_private_image,&p_j2k->m_cp)) { + if (!opj_tcd_init(p_j2k->m_tcd,p_j2k->m_private_image,&p_j2k->m_cp, p_j2k->m_tp)) { opj_tcd_destroy(p_j2k->m_tcd); p_j2k->m_tcd = 00; return 
OPJ_FALSE; diff --git a/src/lib/openjp2/j2k.h b/src/lib/openjp2/j2k.h index 358e0739..be85d5d9 100644 --- a/src/lib/openjp2/j2k.h +++ b/src/lib/openjp2/j2k.h @@ -589,6 +589,12 @@ typedef struct opj_j2k /** the current tile coder/decoder **/ struct opj_tcd * m_tcd; + + /** Number of threads to use */ + int m_num_threads; + + /** Thread pool */ + opj_thread_pool_t* m_tp; } opj_j2k_t; @@ -607,6 +613,8 @@ Decoding parameters are returned in j2k->cp. */ void opj_j2k_setup_decoder(opj_j2k_t *j2k, opj_dparameters_t *parameters); +OPJ_BOOL opj_j2k_set_threads(opj_j2k_t *j2k, OPJ_UINT32 num_threads); + /** * Creates a J2K compression structure * diff --git a/src/lib/openjp2/jp2.c b/src/lib/openjp2/jp2.c index a607c8a9..e156ebfc 100644 --- a/src/lib/openjp2/jp2.c +++ b/src/lib/openjp2/jp2.c @@ -1767,6 +1767,11 @@ void opj_jp2_setup_decoder(opj_jp2_t *jp2, opj_dparameters_t *parameters) jp2->ignore_pclr_cmap_cdef = parameters->flags & OPJ_DPARAMETERS_IGNORE_PCLR_CMAP_CDEF_FLAG; } +OPJ_BOOL opj_jp2_set_threads(opj_jp2_t *jp2, OPJ_UINT32 num_threads) +{ + return opj_j2k_set_threads(jp2->j2k, num_threads); +} + /* ----------------------------------------------------------------------- */ /* JP2 encoder interface */ /* ----------------------------------------------------------------------- */ diff --git a/src/lib/openjp2/jp2.h b/src/lib/openjp2/jp2.h index 94138832..b54d0bfd 100644 --- a/src/lib/openjp2/jp2.h +++ b/src/lib/openjp2/jp2.h @@ -243,6 +243,8 @@ Decoding parameters are returned in jp2->j2k->cp. 
*/ void opj_jp2_setup_decoder(opj_jp2_t *jp2, opj_dparameters_t *parameters); +OPJ_BOOL opj_jp2_set_threads(opj_jp2_t *jp2, OPJ_UINT32 num_threads); + /** * Decode an image from a JPEG-2000 file stream * @param jp2 JP2 decompressor handle diff --git a/src/lib/openjp2/openjpeg.c b/src/lib/openjp2/openjpeg.c index 5114cc10..ee3e14b6 100644 --- a/src/lib/openjp2/openjpeg.c +++ b/src/lib/openjp2/openjpeg.c @@ -239,6 +239,9 @@ opj_codec_t* OPJ_CALLCONV opj_create_decompress(OPJ_CODEC_FORMAT p_format) OPJ_UINT32 res_factor, struct opj_event_mgr * p_manager)) opj_j2k_set_decoded_resolution_factor; + l_codec->opj_set_threads = + (OPJ_BOOL (*) ( void * p_codec, OPJ_UINT32 num_threads )) opj_j2k_set_threads; + l_codec->m_codec = opj_j2k_create_decompress(); if (! l_codec->m_codec) { @@ -315,6 +318,9 @@ opj_codec_t* OPJ_CALLCONV opj_create_decompress(OPJ_CODEC_FORMAT p_format) OPJ_UINT32 res_factor, opj_event_mgr_t * p_manager)) opj_jp2_set_decoded_resolution_factor; + l_codec->opj_set_threads = + (OPJ_BOOL (*) ( void * p_codec, OPJ_UINT32 num_threads )) opj_jp2_set_threads; + l_codec->m_codec = opj_jp2_create(OPJ_TRUE); if (! 
l_codec->m_codec) { @@ -354,6 +360,18 @@ void OPJ_CALLCONV opj_set_default_decoder_parameters(opj_dparameters_t *paramete } } + +OPJ_BOOL OPJ_CALLCONV opj_codec_set_threads(opj_codec_t *p_codec, + int num_threads) +{ + if (p_codec ) { + opj_codec_private_t * l_codec = (opj_codec_private_t *) p_codec; + + return l_codec->opj_set_threads(l_codec->m_codec, num_threads); + } + return OPJ_FALSE; +} + OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, opj_dparameters_t *parameters ) diff --git a/src/lib/openjp2/openjpeg.h b/src/lib/openjp2/openjpeg.h index 369693df..7912c236 100644 --- a/src/lib/openjp2/openjpeg.h +++ b/src/lib/openjp2/openjpeg.h @@ -1262,6 +1262,25 @@ OPJ_API void OPJ_CALLCONV opj_set_default_decoder_parameters(opj_dparameters_t * OPJ_API OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec, opj_dparameters_t *parameters ); +/** + * Allocates worker threads for the compressor/decompressor. + * + * By default, only the main thread is used. If this function is not used, + * but the OPJ_NUM_THREADS environment variable is set, its value will be + * used to initialize the number of threads. The value can be either an integer + * number, or "ALL_CPUS". If OPJ_NUM_THREADS is set and this function is called, + * this function will override the behaviour of the environment variable. + * + * Note: currently only has effect on the decompressor. + * + * @param p_codec decompressor handler + * @param num_threads number of threads. + * + * @return OPJ_TRUE if the thread pool could be set up, OPJ_FALSE otherwise + */ +OPJ_API OPJ_BOOL OPJ_CALLCONV opj_codec_set_threads(opj_codec_t *p_codec, + int num_threads); + /** * Decodes an image header.
* diff --git a/src/lib/openjp2/opj_codec.h b/src/lib/openjp2/opj_codec.h index 6bd791fa..c88005d7 100644 --- a/src/lib/openjp2/opj_codec.h +++ b/src/lib/openjp2/opj_codec.h @@ -113,6 +113,7 @@ typedef struct opj_codec_private OPJ_BOOL (*opj_set_decoded_resolution_factor) ( void * p_codec, OPJ_UINT32 res_factor, opj_event_mgr_t * p_manager); + } m_decompression; /** @@ -157,6 +158,9 @@ typedef struct opj_codec_private void (*opj_dump_codec) (void * p_codec, OPJ_INT32 info_flag, FILE* output_stream); opj_codestream_info_v2_t* (*opj_get_codec_info)(void* p_codec); opj_codestream_index_t* (*opj_get_codec_index)(void* p_codec); + + /** Set number of threads */ + OPJ_BOOL (*opj_set_threads) ( void * p_codec, OPJ_UINT32 num_threads ); } opj_codec_private_t; diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index b8cd3072..d76a3f9d 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -580,7 +580,8 @@ OPJ_BOOL opj_tcd_rateallocate( opj_tcd_t *tcd, OPJ_BOOL opj_tcd_init( opj_tcd_t *p_tcd, opj_image_t * p_image, - opj_cp_t * p_cp ) + opj_cp_t * p_cp, + opj_thread_pool_t* p_tp ) { p_tcd->image = p_image; p_tcd->cp = p_cp; @@ -597,6 +598,7 @@ OPJ_BOOL opj_tcd_init( opj_tcd_t *p_tcd, p_tcd->tcd_image->tiles->numcomps = p_image->numcomps; p_tcd->tp_pos = p_cp->m_specific_param.m_enc.m_tp_pos; + p_tcd->thread_pool = p_tp; return OPJ_TRUE; } diff --git a/src/lib/openjp2/tcd.h b/src/lib/openjp2/tcd.h index 07f8379a..77817bf6 100644 --- a/src/lib/openjp2/tcd.h +++ b/src/lib/openjp2/tcd.h @@ -220,6 +220,8 @@ typedef struct opj_tcd OPJ_UINT32 tcd_tileno; /** tell if the tcd is a decoder. */ OPJ_UINT32 m_is_decoder : 1; + /** Thread pool */ + opj_thread_pool_t* thread_pool; } opj_tcd_t; /** @name Exported functions */ @@ -249,12 +251,14 @@ void opj_tcd_destroy(opj_tcd_t *tcd); * @param p_tcd TCD handle. * @param p_image raw image. * @param p_cp coding parameters. + * @param p_tp thread pool * * @return true if the encoding values could be set (false otherwise). 
*/ OPJ_BOOL opj_tcd_init( opj_tcd_t *p_tcd, opj_image_t * p_image, - opj_cp_t * p_cp ); + opj_cp_t * p_cp, + opj_thread_pool_t* p_tp); /** * Allocates memory for decoding a specific tile. From 5fbb8b2645a085391b070162d8551aa960caab6a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 16:38:44 +0200 Subject: [PATCH 15/22] Use thread-pool for T1 decoding --- src/lib/openjp2/opj_includes.h | 1 + src/lib/openjp2/t1.c | 224 +++++++++++++++++++++------------ src/lib/openjp2/t1.h | 3 +- src/lib/openjp2/tcd.c | 20 +-- src/lib/openjp2/tls_keys.h | 37 ++++++ 5 files changed, 191 insertions(+), 94 deletions(-) create mode 100644 src/lib/openjp2/tls_keys.h diff --git a/src/lib/openjp2/opj_includes.h b/src/lib/openjp2/opj_includes.h index c2cc31fa..e835fae4 100644 --- a/src/lib/openjp2/opj_includes.h +++ b/src/lib/openjp2/opj_includes.h @@ -183,6 +183,7 @@ static INLINE long opj_lrintf(float f) { #include "cio.h" #include "thread.h" +#include "tls_keys.h" #include "image.h" #include "invert.h" diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index e1097bf5..adf64bb4 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1540,13 +1540,140 @@ void opj_t1_destroy(opj_t1_t *p_t1) opj_free(p_t1); } -OPJ_BOOL opj_t1_decode_cblks( opj_t1_t* t1, - opj_tcd_tilecomp_t* tilec, - opj_tccp_t* tccp - ) +typedef struct +{ + OPJ_UINT32 resno; + opj_tcd_cblk_dec_t* cblk; + opj_tcd_band_t* band; + opj_tcd_tilecomp_t* tilec; + opj_tccp_t* tccp; + volatile OPJ_BOOL* pret; +} opj_t1_cblk_decode_processing_job_t; + +static void opj_t1_destroy_wrapper(void* t1) +{ + opj_t1_destroy( (opj_t1_t*) t1 ); +} + +static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls) +{ + opj_tcd_cblk_dec_t* cblk; + opj_tcd_band_t* band; + opj_tcd_tilecomp_t* tilec; + opj_tccp_t* tccp; + OPJ_INT32* restrict datap; + OPJ_UINT32 cblk_w, cblk_h; + OPJ_INT32 x, y; + OPJ_UINT32 i, j; + opj_t1_cblk_decode_processing_job_t* job; + opj_t1_t* t1; + OPJ_UINT32 resno; + OPJ_UINT32 
tile_w; + + job = (opj_t1_cblk_decode_processing_job_t*) user_data; + resno = job->resno; + cblk = job->cblk; + band = job->band; + tilec = job->tilec; + tccp = job->tccp; + tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); + + if( !*(job->pret) ) + { + opj_free(job); + return; + } + + t1 = (opj_t1_t*) opj_tls_get(tls, OPJ_TLS_KEY_T1); + if( t1 == NULL ) + { + t1 = opj_t1_create( OPJ_FALSE ); + opj_tls_set( tls, OPJ_TLS_KEY_T1, t1, opj_t1_destroy_wrapper ); + } + + if (OPJ_FALSE == opj_t1_decode_cblk( + t1, + cblk, + band->bandno, + (OPJ_UINT32)tccp->roishift, + tccp->cblksty)) { + *(job->pret) = OPJ_FALSE; + opj_free(job); + return; + } + + x = cblk->x0 - band->x0; + y = cblk->y0 - band->y0; + if (band->bandno & 1) { + opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; + x += pres->x1 - pres->x0; + } + if (band->bandno & 2) { + opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; + y += pres->y1 - pres->y0; + } + + datap=t1->data; + cblk_w = t1->w; + cblk_h = t1->h; + + if (tccp->roishift) { + OPJ_INT32 thresh = 1 << tccp->roishift; + for (j = 0; j < cblk_h; ++j) { + for (i = 0; i < cblk_w; ++i) { + OPJ_INT32 val = datap[(j * cblk_w) + i]; + OPJ_INT32 mag = abs(val); + if (mag >= thresh) { + mag >>= tccp->roishift; + datap[(j * cblk_w) + i] = val < 0 ? 
-mag : mag; + } + } + } + } + if (tccp->qmfbid == 1) { + OPJ_INT32* restrict tiledp = &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x]; + for (j = 0; j < cblk_h; ++j) { + i = 0; + for (; i < (cblk_w & ~3); i += 4) { + OPJ_INT32 tmp0 = datap[(j * cblk_w) + i]; + OPJ_INT32 tmp1 = datap[(j * cblk_w) + i+1]; + OPJ_INT32 tmp2 = datap[(j * cblk_w) + i+2]; + OPJ_INT32 tmp3 = datap[(j * cblk_w) + i+3]; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp0/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+1] = tmp1/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+2] = tmp2/2; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i+3] = tmp3/2; + } + for (; i < cblk_w; ++i) { + OPJ_INT32 tmp = datap[(j * cblk_w) + i]; + ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp/2; + } + } + } else { /* if (tccp->qmfbid == 0) */ + OPJ_FLOAT32* restrict tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x]; + for (j = 0; j < cblk_h; ++j) { + OPJ_FLOAT32* restrict tiledp2 = tiledp; + for (i = 0; i < cblk_w; ++i) { + OPJ_FLOAT32 tmp = (OPJ_FLOAT32)*datap * band->stepsize; + *tiledp2 = tmp; + datap++; + tiledp2++; + } + tiledp += tile_w; + } + } + + opj_free(job); +} + + +void opj_t1_decode_cblks( opj_thread_pool_t* tp, + volatile OPJ_BOOL* pret, + opj_tcd_tilecomp_t* tilec, + opj_tccp_t* tccp + ) { OPJ_UINT32 resno, bandno, precno, cblkno; - OPJ_UINT32 tile_w = (OPJ_UINT32)(tilec->x1 - tilec->x0); for (resno = 0; resno < tilec->minimum_num_resolutions; ++resno) { opj_tcd_resolution_t* res = &tilec->resolutions[resno]; @@ -1559,85 +1686,24 @@ OPJ_BOOL opj_t1_decode_cblks( opj_t1_t* t1, for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; - OPJ_INT32* restrict datap; - OPJ_UINT32 cblk_w, cblk_h; - OPJ_INT32 x, y; - OPJ_UINT32 i, j; + opj_t1_cblk_decode_processing_job_t* job; - if (OPJ_FALSE == opj_t1_decode_cblk( - t1, - cblk, - band->bandno, - (OPJ_UINT32)tccp->roishift, - tccp->cblksty)) { - return OPJ_FALSE; - } - - x = cblk->x0 
- band->x0; - y = cblk->y0 - band->y0; - if (band->bandno & 1) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - x += pres->x1 - pres->x0; - } - if (band->bandno & 2) { - opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; - y += pres->y1 - pres->y0; - } - - datap=t1->data; - cblk_w = t1->w; - cblk_h = t1->h; - - if (tccp->roishift) { - OPJ_INT32 thresh = 1 << tccp->roishift; - for (j = 0; j < cblk_h; ++j) { - for (i = 0; i < cblk_w; ++i) { - OPJ_INT32 val = datap[(j * cblk_w) + i]; - OPJ_INT32 mag = abs(val); - if (mag >= thresh) { - mag >>= tccp->roishift; - datap[(j * cblk_w) + i] = val < 0 ? -mag : mag; - } - } - } - } - if (tccp->qmfbid == 1) { - OPJ_INT32* restrict tiledp = &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x]; - for (j = 0; j < cblk_h; ++j) { - i = 0; - for (; i < (cblk_w & ~3); i += 4) { - OPJ_INT32 tmp0 = datap[(j * cblk_w) + i]; - OPJ_INT32 tmp1 = datap[(j * cblk_w) + i+1]; - OPJ_INT32 tmp2 = datap[(j * cblk_w) + i+2]; - OPJ_INT32 tmp3 = datap[(j * cblk_w) + i+3]; - ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp0/2; - ((OPJ_INT32*)tiledp)[(j * tile_w) + i+1] = tmp1/2; - ((OPJ_INT32*)tiledp)[(j * tile_w) + i+2] = tmp2/2; - ((OPJ_INT32*)tiledp)[(j * tile_w) + i+3] = tmp3/2; - } - for (; i < cblk_w; ++i) { - OPJ_INT32 tmp = datap[(j * cblk_w) + i]; - ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp/2; - } - } - } else { /* if (tccp->qmfbid == 0) */ - OPJ_FLOAT32* restrict tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x]; - for (j = 0; j < cblk_h; ++j) { - OPJ_FLOAT32* restrict tiledp2 = tiledp; - for (i = 0; i < cblk_w; ++i) { - OPJ_FLOAT32 tmp = (OPJ_FLOAT32)*datap * band->stepsize; - *tiledp2 = tmp; - datap++; - tiledp2++; - } - tiledp += tile_w; - } - } + job = (opj_t1_cblk_decode_processing_job_t*) opj_calloc(1, sizeof(opj_t1_cblk_decode_processing_job_t)); + job->resno = resno; + job->cblk = cblk; + job->band = band; + job->tilec = tilec; + job->tccp = tccp; + job->pret = pret; + 
opj_thread_pool_submit_job( tp, opj_t1_clbl_decode_processor, job ); + if( !(*pret) ) + return; } /* cblkno */ } /* precno */ } /* bandno */ } /* resno */ - return OPJ_TRUE; + + return; } diff --git a/src/lib/openjp2/t1.h b/src/lib/openjp2/t1.h index 22557d96..5afc6490 100644 --- a/src/lib/openjp2/t1.h +++ b/src/lib/openjp2/t1.h @@ -172,7 +172,8 @@ Decode the code-blocks of a tile @param tilec The tile to decode @param tccp Tile coding parameters */ -OPJ_BOOL opj_t1_decode_cblks( opj_t1_t* t1, +void opj_t1_decode_cblks( opj_thread_pool_t* tp, + volatile OPJ_BOOL* pret, opj_tcd_tilecomp_t* tilec, opj_tccp_t* tccp); diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index d76a3f9d..a34fa18c 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -1568,30 +1568,22 @@ static OPJ_BOOL opj_tcd_t2_decode (opj_tcd_t *p_tcd, static OPJ_BOOL opj_tcd_t1_decode ( opj_tcd_t *p_tcd ) { OPJ_UINT32 compno; - opj_t1_t * l_t1; opj_tcd_tile_t * l_tile = p_tcd->tcd_image->tiles; opj_tcd_tilecomp_t* l_tile_comp = l_tile->comps; opj_tccp_t * l_tccp = p_tcd->tcp->tccps; - - - l_t1 = opj_t1_create(OPJ_FALSE); - if (l_t1 == 00) { - return OPJ_FALSE; - } + volatile OPJ_BOOL ret = OPJ_TRUE; for (compno = 0; compno < l_tile->numcomps; ++compno) { - /* The +3 is headroom required by the vectorized DWT */ - if (OPJ_FALSE == opj_t1_decode_cblks(l_t1, l_tile_comp, l_tccp)) { - opj_t1_destroy(l_t1); - return OPJ_FALSE; - } + opj_t1_decode_cblks(p_tcd->thread_pool, &ret, l_tile_comp, l_tccp); + if( !ret ) + break; ++l_tile_comp; ++l_tccp; } - opj_t1_destroy(l_t1); + opj_thread_pool_wait_completion(p_tcd->thread_pool, 0); - return OPJ_TRUE; + return ret; } diff --git a/src/lib/openjp2/tls_keys.h b/src/lib/openjp2/tls_keys.h new file mode 100644 index 00000000..fb26498d --- /dev/null +++ b/src/lib/openjp2/tls_keys.h @@ -0,0 +1,37 @@ +/* + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. 
This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2016, Even Rouault + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef TLS_KEYS_H +#define TLS_KEYS_H + +#define OPJ_TLS_KEY_T1 0 + +#endif From 57b216bb587aa7eba13afbbfd6a1fe5f04201b61 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 18:07:15 +0200 Subject: [PATCH 16/22] Use thread pool for DWT decoding --- src/lib/openjp2/dwt.c | 172 +++++++++++++++++++++++++++++++++++++----- src/lib/openjp2/dwt.h | 3 +- src/lib/openjp2/tcd.c | 2 +- 3 files changed, 157 insertions(+), 20 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index a4ff01ba..e21cc16f 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -124,7 +124,7 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_st /** Inverse wavelet transform in 2-D. */ -static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn); +static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn); static OPJ_BOOL opj_dwt_encode_procedure( opj_tcd_tilecomp_t * tilec, void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) ); @@ -473,8 +473,8 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) /* */ /* Inverse 5-3 wavelet transform in 2-D. 
*/ /* */ -OPJ_BOOL opj_dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) { - return opj_dwt_decode_tile(tilec, numres, &opj_dwt_decode_1); +OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) { + return opj_dwt_decode_tile(tp, tilec, numres, &opj_dwt_decode_1); } @@ -556,10 +556,72 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_U return mr ; } +typedef struct +{ + opj_dwt_t h; + DWT1DFN dwt_1D; + OPJ_UINT32 rw; + OPJ_UINT32 w; + OPJ_INT32 * restrict tiledp; + int min_j; + int max_j; +} opj_dwd_decode_h_job_t; + +static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) +{ + int j; + opj_dwd_decode_h_job_t* job; + (void)tls; + + job = (opj_dwd_decode_h_job_t*)user_data; + for( j = job->min_j; j < job->max_j; j++ ) + { + opj_dwt_interleave_h(&job->h, &job->tiledp[j*job->w]); + (job->dwt_1D)(&job->h); + memcpy(&job->tiledp[j*job->w], job->h.mem, job->rw * sizeof(OPJ_INT32)); + } + + opj_aligned_free(job->h.mem); + opj_free(job); +} + +typedef struct +{ + opj_dwt_t v; + DWT1DFN dwt_1D; + OPJ_UINT32 rh; + OPJ_UINT32 w; + OPJ_INT32 * restrict tiledp; + int min_j; + int max_j; +} opj_dwd_decode_v_job_t; + +static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) +{ + int j; + opj_dwd_decode_v_job_t* job; + (void)tls; + + job = (opj_dwd_decode_v_job_t*)user_data; + for( j = job->min_j; j < job->max_j; j++ ) + { + OPJ_UINT32 k; + opj_dwt_interleave_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w); + (job->dwt_1D)(&job->v); + for(k = 0; k < job->rh; ++k) { + job->tiledp[k * job->w + j] = job->v.mem[k]; + } + } + + opj_aligned_free(job->v.mem); + opj_free(job); +} + + /* */ /* Inverse wavelet transform in 2-D. 
*/ /* */ -static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) { +static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) { opj_dwt_t h; opj_dwt_t v; @@ -569,11 +631,15 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - tr->y0); /* height of the resolution level computed */ OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0); + size_t h_mem_size; + int num_threads; if (numres == 1U) { return OPJ_TRUE; } - h.mem = (OPJ_INT32*)opj_aligned_malloc(opj_dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32)); + num_threads = opj_thread_pool_get_thread_count(tp); + h_mem_size = opj_dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32); + h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); if (! h.mem){ /* FIXME event manager error callback */ return OPJ_FALSE; @@ -595,23 +661,93 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); h.cas = tr->x0 % 2; - for(j = 0; j < rh; ++j) { - opj_dwt_interleave_h(&h, &tiledp[j*w]); - (dwt_1D)(&h); - memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32)); - } + if( num_threads <= 1 || rh == 1 ) + { + for(j = 0; j < rh; ++j) { + opj_dwt_interleave_h(&h, &tiledp[j*w]); + (dwt_1D)(&h); + memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32)); + } + } + else + { + int num_jobs = num_threads; + if( rh < num_jobs ) + num_jobs = rh; + for( j = 0; j < num_jobs; j++ ) + { + opj_dwd_decode_h_job_t* job; + + job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); + job->h = h; + job->dwt_1D = dwt_1D; + job->rw = rw; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * (rh / num_jobs); + job->max_j = (j+1) * (rh / num_jobs); + if( job->max_j > rh || j == num_jobs - 1 ) + job->max_j = rh; + job->h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); + if (!job->h.mem) + { + /* FIXME event 
manager error callback */ + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(h.mem); + return OPJ_FALSE; + } + opj_thread_pool_submit_job( tp, opj_dwt_decode_h_func, job ); + } + opj_thread_pool_wait_completion(tp, 0); + } v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); v.cas = tr->y0 % 2; - for(j = 0; j < rw; ++j){ - OPJ_UINT32 k; - opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w); - (dwt_1D)(&v); - for(k = 0; k < rh; ++k) { - tiledp[k * w + j] = v.mem[k]; - } - } + if( num_threads <= 1 || rw == 1 ) + { + for(j = 0; j < rw; ++j){ + OPJ_UINT32 k; + opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w); + (dwt_1D)(&v); + for(k = 0; k < rh; ++k) { + tiledp[k * w + j] = v.mem[k]; + } + } + } + else + { + int num_jobs = num_threads; + if( rw < num_jobs ) + num_jobs = rw; + for( j = 0; j < num_jobs; j++ ) + { + opj_dwd_decode_v_job_t* job; + + job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); + job->v = v; + job->dwt_1D = dwt_1D; + job->rh = rh; + job->w = w; + job->tiledp = tiledp; + job->min_j = j * (rw / num_jobs); + job->max_j = (j+1) * (rw / num_jobs); + if( job->max_j > rw || j == num_jobs - 1 ) + job->max_j = rw; + job->v.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); + if (!job->v.mem) + { + /* FIXME event manager error callback */ + opj_thread_pool_wait_completion(tp, 0); + opj_free(job); + opj_aligned_free(v.mem); + return OPJ_FALSE; + } + opj_thread_pool_submit_job( tp, opj_dwt_decode_v_func, job ); + } + opj_thread_pool_wait_completion(tp, 0); + } } opj_aligned_free(h.mem); return OPJ_TRUE; diff --git a/src/lib/openjp2/dwt.h b/src/lib/openjp2/dwt.h index 21fe942a..93850026 100644 --- a/src/lib/openjp2/dwt.h +++ b/src/lib/openjp2/dwt.h @@ -63,10 +63,11 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec); /** Inverse 5-3 wavelet transform in 2-D. Apply a reversible inverse DWT transform to a component of an image. 
+@param tp Thread pool @param tilec Tile component information (current tile) @param numres Number of resolution levels to decode */ -OPJ_BOOL opj_dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); +OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres); /** Get the gain of a subband for the reversible 5-3 DWT. diff --git a/src/lib/openjp2/tcd.c b/src/lib/openjp2/tcd.c index a34fa18c..2980f723 100644 --- a/src/lib/openjp2/tcd.c +++ b/src/lib/openjp2/tcd.c @@ -1610,7 +1610,7 @@ static OPJ_BOOL opj_tcd_dwt_decode ( opj_tcd_t *p_tcd ) */ if (l_tccp->qmfbid == 1) { - if (! opj_dwt_decode(l_tile_comp, l_img_comp->resno_decoded+1)) { + if (! opj_dwt_decode(p_tcd->thread_pool, l_tile_comp, l_img_comp->resno_decoded+1)) { return OPJ_FALSE; } } From e3eb0a206d66cc873eccb57fc12a0497de400aca Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 18:39:41 +0200 Subject: [PATCH 17/22] .travis.yml: add a conf with OPJ_NUM_THREADS=2 --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index 0618f26e..3d73f75e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,9 @@ matrix: - os: linux compiler: gcc env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1 + - os: linux + compiler: gcc + env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_NUM_THREADS=2 - os: linux compiler: gcc env: OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release From d67cd2220a291b54718f731052be4a9397f67077 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 19:08:04 +0200 Subject: [PATCH 18/22] opj_decompress: add a -threads option --- src/bin/jp2/opj_decompress.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/bin/jp2/opj_decompress.c b/src/bin/jp2/opj_decompress.c index ab7ff04a..0e02c56b 100644 --- a/src/bin/jp2/opj_decompress.c +++ b/src/bin/jp2/opj_decompress.c @@ -150,6 +150,8 @@ typedef 
struct opj_decompress_params int upsample; /* split output components to different files */ int split_pnm; + /** number of threads */ + int num_threads; }opj_decompress_parameters; /* -------------------------------------------------------------------------- */ @@ -224,8 +226,11 @@ static void decode_help_display(void) { " -upsample\n" " Downsampled components will be upsampled to image size\n" " -split-pnm\n" - " Split output components to different files when writing to PNM\n" - "\n"); + " Split output components to different files when writing to PNM\n"); + if( opj_has_thread_support() ) { + fprintf(stdout," -threads <num_threads>\n" + " Number of threads to use for decoding.\n"); + } /* UniPG>> */ #ifdef USE_JPWL fprintf(stdout," -W \n" @@ -520,7 +525,8 @@ int parse_cmdline_decoder(int argc, char **argv, opj_decompress_parameters *para {"OutFor", REQ_ARG, NULL,'O'}, {"force-rgb", NO_ARG, NULL, 1}, {"upsample", NO_ARG, NULL, 1}, - {"split-pnm", NO_ARG, NULL, 1} + {"split-pnm", NO_ARG, NULL, 1}, + {"threads", REQ_ARG, NULL, 'T'} }; const char optlist[] = "i:o:r:l:x:d:t:p:" @@ -808,6 +814,22 @@ int parse_cmdline_decoder(int argc, char **argv, opj_decompress_parameters *para break; #endif /* USE_JPWL */ /* <num_threads = opj_get_num_cpus(); + if( parameters->num_threads == 1 ) + parameters->num_threads = 0; + } + else + { + sscanf(opj_optarg, "%d", &parameters->num_threads); + } + } + break; /* ----------------------------------------------------- */ @@ -1306,7 +1328,13 @@ int main(int argc, char **argv) opj_destroy_codec(l_codec); failed = 1; goto fin; } - + + if( parameters.num_threads >= 1 && !opj_codec_set_threads(l_codec, parameters.num_threads) ) { + fprintf(stderr, "ERROR -> opj_decompress: failed to set number of threads\n"); + opj_stream_destroy(l_stream); + opj_destroy_codec(l_codec); + failed = 1; goto fin; + } /* Read the main header of the codestream and if necessary the JP2 boxes*/ if(!
opj_read_header(l_stream, l_codec, &image)){ From 69497d35c0e35a1f9b789d016e9eb4946b8f0fab Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 25 May 2016 21:39:21 +0200 Subject: [PATCH 19/22] opj_decompress: use clock_gettime() instead of getrusage() so as to get the time spent, and not to the total CPU time --- src/bin/jp2/CMakeLists.txt | 3 +++ src/bin/jp2/opj_decompress.c | 24 +++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/bin/jp2/CMakeLists.txt b/src/bin/jp2/CMakeLists.txt index dc013c21..ad7bce71 100644 --- a/src/bin/jp2/CMakeLists.txt +++ b/src/bin/jp2/CMakeLists.txt @@ -57,6 +57,9 @@ foreach(exe opj_decompress opj_compress opj_dump) # On unix you need to link to the math library: if(UNIX) target_link_libraries(${exe} m) + IF("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") + target_link_libraries(${exe} rt) + endif() endif() # Install exe install(TARGETS ${exe} diff --git a/src/bin/jp2/opj_decompress.c b/src/bin/jp2/opj_decompress.c index 0e02c56b..57fe554b 100644 --- a/src/bin/jp2/opj_decompress.c +++ b/src/bin/jp2/opj_decompress.c @@ -43,6 +43,7 @@ #include #include #include +#include <time.h> #ifdef _WIN32 #include "windirent.h" @@ -907,17 +908,22 @@ OPJ_FLOAT64 opj_clock(void) { /* t is the high resolution performance counter (see MSDN) */ QueryPerformanceCounter ( & t ) ; return freq.QuadPart ? (t.QuadPart / (OPJ_FLOAT64)freq.QuadPart) : 0; +#elif defined(__linux) + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return( ts.tv_sec + ts.tv_nsec * 1e-9 ); #else - /* Unix or Linux: use resource usage */ - struct rusage t; - OPJ_FLOAT64 procTime; - /* (1) Get the rusage data structure at this moment (man getrusage) */ - getrusage(0,&t); - /* (2) What is the elapsed time ?
- CPU time = User time + System time */ + /* Unix : use resource usage */ + /* FIXME: this counts the total CPU time, instead of the user perceived time */ + struct rusage t; + OPJ_FLOAT64 procTime; + /* (1) Get the rusage data structure at this moment (man getrusage) */ + getrusage(0,&t); + /* (2) What is the elapsed time ? - CPU time = User time + System time */ /* (2a) Get the seconds */ - procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec); - /* (2b) More precisely! Get the microseconds part ! */ - return ( procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) * 1e-6 ) ; + procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec); + /* (2b) More precisely! Get the microseconds part ! */ + return ( procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) * 1e-6 ) ; #endif } From 7d3c7a345f05adbc9ca26d8ca7f6c7fffa5096be Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 26 May 2016 23:51:32 +0200 Subject: [PATCH 20/22] Be robust to failed allocations of job structures --- src/lib/openjp2/dwt.c | 20 ++++++++++++++++++++ src/lib/openjp2/t1.c | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index e21cc16f..18f8d9c3 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -679,6 +679,16 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* t opj_dwd_decode_h_job_t* job; job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); + if( !job ) + { + /* It would be nice to fallback to single thread case, but */ + /* unfortunately some jobs may be launched and have modified */ + /* tiledp, so it is not practical to recover from that error */ + /* FIXME event manager error callback */ + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(h.mem); + return OPJ_FALSE; + } job->h = h; job->dwt_1D = dwt_1D; job->rw = rw; @@ -726,6 +736,16 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* t 
opj_dwd_decode_v_job_t* job; job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); + if( !job ) + { + /* It would be nice to fallback to single thread case, but */ + /* unfortunately some jobs may be launched and have modified */ + /* tiledp, so it is not practical to recover from that error */ + /* FIXME event manager error callback */ + opj_thread_pool_wait_completion(tp, 0); + opj_aligned_free(v.mem); + return OPJ_FALSE; + } job->v = v; job->dwt_1D = dwt_1D; job->rh = rh; diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c index adf64bb4..b1c6128d 100644 --- a/src/lib/openjp2/t1.c +++ b/src/lib/openjp2/t1.c @@ -1689,6 +1689,11 @@ void opj_t1_decode_cblks( opj_thread_pool_t* tp, opj_t1_cblk_decode_processing_job_t* job; job = (opj_t1_cblk_decode_processing_job_t*) opj_calloc(1, sizeof(opj_t1_cblk_decode_processing_job_t)); + if( !job ) + { + *pret = OPJ_FALSE; + return; + } job->resno = resno; job->cblk = cblk; job->band = band; From 4f9abb9a45ffd711f9717db15d062fa020ed6cf5 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 11 Aug 2016 21:50:46 +0200 Subject: [PATCH 21/22] [Win32] Use _beginthreadex instead of CreateThread() --- src/lib/openjp2/thread.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib/openjp2/thread.c b/src/lib/openjp2/thread.c index b2f8b5b2..59b5d87e 100644 --- a/src/lib/openjp2/thread.c +++ b/src/lib/openjp2/thread.c @@ -44,6 +44,7 @@ #endif #include +#include <process.h> OPJ_BOOL OPJ_CALLCONV opj_has_thread_support(void) { @@ -224,11 +225,11 @@ struct opj_thread_t HANDLE hThread; }; -static DWORD WINAPI opj_thread_callback_adapter( void *info ) +unsigned int __stdcall opj_thread_callback_adapter( void *info ) { opj_thread_t* thread = (opj_thread_t*) info; HANDLE hEvent = NULL; - + thread->thread_fn( thread->user_data ); /* Free the handle possible allocated by a cond */ @@ -258,7 +259,6 @@ static DWORD WINAPI opj_thread_callback_adapter( void *info ) opj_thread_t* opj_thread_create(
opj_thread_fn thread_fn, void* user_data ) { opj_thread_t* thread; - DWORD nThreadId = 0; assert( thread_fn ); @@ -268,8 +268,8 @@ opj_thread_t* opj_thread_create( opj_thread_fn thread_fn, void* user_data ) thread->thread_fn = thread_fn; thread->user_data = user_data; - thread->hThread = CreateThread( NULL, 0, opj_thread_callback_adapter, thread, - 0, &nThreadId ); + thread->hThread = (HANDLE)_beginthreadex(NULL, 0, + opj_thread_callback_adapter, thread, 0, NULL); if( thread->hThread == NULL ) { From ab22c5bad55fccdc440847c896baaf4bf89365a0 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 8 Sep 2016 09:43:36 +0200 Subject: [PATCH 22/22] opj_thread_pool: fix potential deadlock at thread pool destruction --- src/lib/openjp2/thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/openjp2/thread.c b/src/lib/openjp2/thread.c index 59b5d87e..fce563d0 100644 --- a/src/lib/openjp2/thread.c +++ b/src/lib/openjp2/thread.c @@ -930,7 +930,9 @@ void opj_thread_pool_destroy(opj_thread_pool_t* tp) int i; opj_thread_pool_wait_completion(tp, 0); + opj_mutex_lock(tp->mutex); tp->state = OPJWTS_STOP; + opj_mutex_unlock(tp->mutex); for(i=0;i<tp->worker_threads_count;i++) {