From b684247201349584a8dc07d99dcc5fcfd3f267e1 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 4 Sep 2021 12:09:59 +1000
Subject: [PATCH] Formatted files with prepare_commit.sh.  Code fixed to
 compile with Visual Studio 10

---
 src/lib/openjp2/fbc_dec.c | 4014 ++++++++++++++++++-------------------
 src/lib/openjp2/j2k.c     |   16 +-
 src/lib/openjp2/j2k.h     |    3 +-
 src/lib/openjp2/t1.c      |    3 +-
 src/lib/openjp2/t2.c      |   98 +-
 src/lib/openjp2/tcd.h     |    6 +-
 6 files changed, 2055 insertions(+), 2085 deletions(-)

diff --git a/src/lib/openjp2/fbc_dec.c b/src/lib/openjp2/fbc_dec.c
index 1627fedb..0b52ca7d 100644
--- a/src/lib/openjp2/fbc_dec.c
+++ b/src/lib/openjp2/fbc_dec.c
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2021, Aous Naman 
+// Copyright (c) 2021, Aous Naman
 // Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2021, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -36,7 +36,7 @@
 //***************************************************************************/
 
 //***************************************************************************/
-/** @file fbc.cpp
+/** @file fbc_dec.c
  *  @brief implements HTJ2K block decoder
  */
 
@@ -50,19 +50,19 @@
 // compiler detection
 /////////////////////////////////////////////////////////////////////////////
 #ifdef _MSC_VER
-  #define OPJ_COMPILER_MSVC
+#define OPJ_COMPILER_MSVC
 #elif (defined __GNUC__)
-  #define OPJ_COMPILER_GNUC
+#define OPJ_COMPILER_GNUC
 #endif
 
 //************************************************************************/
-/** @brief Displays the error message for disabling the decoding of CUP 
+/** @brief Displays the error message for disabling the decoding of CUP
   *        pass due to insufficient precision once
   */
 static OPJ_BOOL cannot_decode_due_to_insufficient_precision = OPJ_FALSE;
 
 //************************************************************************/
-/** @brief Displays the error message for disabling the decoding of SPP and 
+/** @brief Displays the error message for disabling the decoding of SPP and
   *        MRP passes once
   */
 static OPJ_BOOL cannot_decode_spp_mrp_msg = OPJ_FALSE;
@@ -71,21 +71,21 @@ static OPJ_BOOL cannot_decode_spp_mrp_msg = OPJ_FALSE;
 /** @brief Generates population count (i.e., the number of set bits)
   *
   *   @param [in]  val is the value for which population count is sought
-  */ 
-static inline 
+  */
+static INLINE
 OPJ_UINT32 population_count(OPJ_UINT32 val)
 {
 #ifdef OPJ_COMPILER_MSVC
-  return (OPJ_UINT32)__popcnt(val);
+    return (OPJ_UINT32)__popcnt(val);
 #elif (defined OPJ_COMPILER_GNUC)
-  return (OPJ_UINT32)__builtin_popcount(val);
+    return (OPJ_UINT32)__builtin_popcount(val);
 #else
-  val -= ((val >> 1) & 0x55555555);
-  val = (((val >> 2) & 0x33333333) + (val & 0x33333333));
-  val = (((val >> 4) + val) & 0x0f0f0f0f);
-  val += (val >> 8);
-  val += (val >> 16);
-  return (OPJ_UINT32)(val & 0x0000003f);
+    val -= ((val >> 1) & 0x55555555);
+    val = (((val >> 2) & 0x33333333) + (val & 0x33333333));
+    val = (((val >> 4) + val) & 0x0f0f0f0f);
+    val += (val >> 8);
+    val += (val >> 16);
+    return (OPJ_UINT32)(val & 0x0000003f);
 #endif
 }
 
@@ -93,26 +93,26 @@ OPJ_UINT32 population_count(OPJ_UINT32 val)
 /** @brief Counts the number of leading zeros
   *
   *   @param [in]  val is the value for which leading zero count is sought
-  */ 
+  */
 #ifdef OPJ_COMPILER_MSVC
-  #pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanReverse)
 #endif
-static inline 
+static INLINE
 OPJ_UINT32 count_leading_zeros(OPJ_UINT32 val)
 {
 #ifdef OPJ_COMPILER_MSVC
-  unsigned long result = 0;
-  _BitScanReverse(&result, val);
-  return 31U ^ (OPJ_UINT32)result;
+    unsigned long result = 0;
+    _BitScanReverse(&result, val);
+    return 31U ^ (OPJ_UINT32)result;
 #elif (defined OPJ_COMPILER_GNUC)
-  return (OPJ_UINT32)__builtin_clz(val);
+    return (OPJ_UINT32)__builtin_clz(val);
 #else
-  val |= (val >> 1);
-  val |= (val >> 2);
-  val |= (val >> 4);
-  val |= (val >> 8);
-  val |= (val >> 16);
-  return 32U - population_count(val);
+    val |= (val >> 1);
+    val |= (val >> 2);
+    val |= (val >> 4);
+    val |= (val >> 8);
+    val |= (val >> 16);
+    return 32U - population_count(val);
 #endif
 }
 
@@ -122,24 +122,24 @@ OPJ_UINT32 count_leading_zeros(OPJ_UINT32 val)
   *  A number of events is decoded from the MEL bitstream ahead of time
   *  and stored in run/num_runs.
   *  Each run represents the number of zero events before a one event.
-  */ 
+  */
 typedef struct dec_mel {
-  // data decoding machinary
-  OPJ_UINT8* data;  //!<the address of data (or bitstream)
-  OPJ_UINT64 tmp;   //!<temporary buffer for read data
-  int bits;         //!<number of bits stored in tmp
-  int size;         //!<number of bytes in MEL code
-  OPJ_BOOL unstuff; //!<true if the next bit needs to be unstuffed
-  int k;            //!<state of MEL decoder
+    // data decoding machinery
+    OPJ_UINT8* data;  //!<the address of data (or bitstream)
+    OPJ_UINT64 tmp;   //!<temporary buffer for read data
+    int bits;         //!<number of bits stored in tmp
+    int size;         //!<number of bytes in MEL code
+    OPJ_BOOL unstuff; //!<true if the next bit needs to be unstuffed
+    int k;            //!<state of MEL decoder
 
-  // queue of decoded runs
-  int num_runs;    //!<number of decoded runs left in runs (maximum 8)
-  OPJ_UINT64 runs; //!<runs of decoded MEL codewords (7 bits/run)
+    // queue of decoded runs
+    int num_runs;    //!<number of decoded runs left in runs (maximum 8)
+    OPJ_UINT64 runs; //!<runs of decoded MEL codewords (7 bits/run)
 } dec_mel_t;
 
 //************************************************************************/
 /** @brief Reads and unstuffs the MEL bitstream
-  * 
+  *
   *  This design needs more bytes in the codeblock buffer than the length
   *  of the cleanup pass by up to 2 bytes.
   *
@@ -149,171 +149,181 @@ typedef struct dec_mel {
   *
   *  @param [in]  melp is a pointer to dec_mel_t structure
   */
-static inline
+static INLINE
 void mel_read(dec_mel_t *melp)
 {
-  OPJ_UINT32 val; 
-  int bits;
-  OPJ_UINT32 t;
-  OPJ_BOOL unstuff;
-  
-  if (melp->bits > 32)  //there are enough bits in the tmp variable
-    return;             // return without reading new data
+    OPJ_UINT32 val;
+    int bits;
+    OPJ_UINT32 t;
+    OPJ_BOOL unstuff;
 
-  val = 0xFFFFFFFF;
-  //the next line (the if statement) needs to be tested first
-  //if (melp->size > 0)              // if there is data in the MEL segment
+    if (melp->bits > 32) { //there are enough bits in the tmp variable
+        return;    // return without reading new data
+    }
+    val = 0xFFFFFFFF;
+    //the next line (the if statement) needs to be tested first
+    //if (melp->size > 0)              // if there is data in the MEL segment
     val = *(OPJ_UINT32*)melp->data;  // read 32 bits from MEL data
-      
-  // next we unstuff them before adding them to the buffer
-  bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
-                             // the previously read byte requires 
-                             // unstuffing
 
-  // data is unstuffed and accumulated in t
-  // bits has the number of bits in t
-  t = (melp->size > 0) ? (val & 0xFF) : 0xFF; // feed 0xFF if the 
-                                  // MEL bitstream has been exhausted
-  if (melp->size == 1) t |= 0xF;  // if this is 1 byte before the last
-                                  // in MEL+VLC segments (remember they
-                                  // can overlap)
-  melp->data += melp->size-- > 0; // advance data by 1 byte if we have not
-                                  // reached the end of the MEL segment
-  unstuff = ((val & 0xFF) == 0xFF); // true if the byte needs unstuffing
+    // next we unstuff them before adding them to the buffer
+    bits = 32 - melp->unstuff;      // number of bits in val, subtract 1 if
+    // the previously read byte requires
+    // unstuffing
 
-  bits -= unstuff; // there is one less bit in t if unstuffing is needed
-  t = t << (8 - unstuff); // move up to make room for the next byte
+    // data is unstuffed and accumulated in t
+    // bits has the number of bits in t
+    t = (melp->size > 0) ? (val & 0xFF) : 0xFF; // feed 0xFF if the
+    // MEL bitstream has been exhausted
+    if (melp->size == 1) {
+        t |= 0xF;    // if this is 1 byte before the last
+    }
+    // in MEL+VLC segments (remember they
+    // can overlap)
+    melp->data += melp->size-- > 0; // advance data by 1 byte if we have not
+    // reached the end of the MEL segment
+    unstuff = ((val & 0xFF) == 0xFF); // true if the byte needs unstuffing
 
-  //this is a repeat of the above
-  t |= (melp->size > 0) ? ((val>>8) & 0xFF) : 0xFF;
-  if (melp->size == 1) t |= 0xF;
-  melp->data += melp->size-- > 0;
-  unstuff = (((val >> 8) & 0xFF) == 0xFF);
+    bits -= unstuff; // there is one less bit in t if unstuffing is needed
+    t = t << (8 - unstuff); // move up to make room for the next byte
 
-  bits -= unstuff;
-  t = t << (8 - unstuff);
+    //this is a repeat of the above
+    t |= (melp->size > 0) ? ((val >> 8) & 0xFF) : 0xFF;
+    if (melp->size == 1) {
+        t |= 0xF;
+    }
+    melp->data += melp->size-- > 0;
+    unstuff = (((val >> 8) & 0xFF) == 0xFF);
 
-  t |= (melp->size > 0) ? ((val>>16) & 0xFF) : 0xFF;
-  if (melp->size == 1) t |= 0xF;
-  melp->data += melp->size-- > 0;
-  unstuff = (((val >> 16) & 0xFF) == 0xFF);
+    bits -= unstuff;
+    t = t << (8 - unstuff);
 
-  bits -= unstuff;
-  t = t << (8 - unstuff);
+    t |= (melp->size > 0) ? ((val >> 16) & 0xFF) : 0xFF;
+    if (melp->size == 1) {
+        t |= 0xF;
+    }
+    melp->data += melp->size-- > 0;
+    unstuff = (((val >> 16) & 0xFF) == 0xFF);
 
-  t |= (melp->size > 0) ? ((val>>24) & 0xFF) : 0xFF;
-  if (melp->size == 1) t |= 0xF;
-  melp->data += melp->size-- > 0;
-  melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+    bits -= unstuff;
+    t = t << (8 - unstuff);
 
-  // move t to tmp, and push the result all the way up, so we read from
-  // the MSB
-  melp->tmp |= ((OPJ_UINT64)t) << (64 - bits - melp->bits);
-  melp->bits += bits; //increment the number of bits in tmp
+    t |= (melp->size > 0) ? ((val >> 24) & 0xFF) : 0xFF;
+    if (melp->size == 1) {
+        t |= 0xF;
+    }
+    melp->data += melp->size-- > 0;
+    melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+    // move t to tmp, and push the result all the way up, so we read from
+    // the MSB
+    melp->tmp |= ((OPJ_UINT64)t) << (64 - bits - melp->bits);
+    melp->bits += bits; //increment the number of bits in tmp
 }
 
 //************************************************************************/
 /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
-  * 
+  *
   *  Runs are stored in "runs" and the number of runs in "num_runs".
-  *  Each run represents a number of zero events that may or may not 
+  *  Each run represents a number of zero events that may or may not
   *  terminate in a 1 event.
   *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
-  *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating 
-  *  with 1, contain the number of consecutive 0 zero events * 2; for the 
-  *  case terminating with 0, they store (number of consecutive 0 zero 
+  *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating
+  *  with 1, contain the number of consecutive 0 zero events * 2; for the
+  *  case terminating with 0, they store (number of consecutive 0 zero
   *  events - 1) * 2.
   *  A total of 6 bits (made up of 1 + 5) should have been enough.
   *
   *  @param [in]  melp is a pointer to dec_mel_t structure
   */
-static inline
+static INLINE
 void mel_decode(dec_mel_t *melp)
 {
-  static const int mel_exp[13] = { //MEL exponents
-    0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
-  };
+    static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+    };
 
-  if (melp->bits < 6) // if there are less than 6 bits in tmp
-    mel_read(melp);   // then read from the MEL bitstream
-                      // 6 bits is the largest decodable MEL cwd
+    if (melp->bits < 6) { // if there are less than 6 bits in tmp
+        mel_read(melp);    // then read from the MEL bitstream
+    }
+    // 6 bits is the largest decodable MEL cwd
 
-  //repeat so long that there is enough decodable bits in tmp,
-  // and the runs store is not full (num_runs < 8)
-  while (melp->bits >= 6 && melp->num_runs < 8)
-  {
-    int eval = mel_exp[melp->k]; // number of bits associated with state
-    int run = 0;
-    if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
-    { //one is found
-      run = 1 << eval;  
-      run--; // consecutive runs of 0 events - 1
-      melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
-      melp->tmp <<= 1; // consume one bit from tmp
-      melp->bits -= 1;
-      run = run << 1; // a stretch of zeros not terminating in one
+    //repeat as long as there are enough decodable bits in tmp,
+    // and the runs store is not full (num_runs < 8)
+    while (melp->bits >= 6 && melp->num_runs < 8) {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull << 63)) { //The next bit to decode (stored in MSB)
+            //one is found
+            run = 1 << eval;
+            run--; // consecutive runs of 0 events - 1
+            melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+            melp->tmp <<= 1; // consume one bit from tmp
+            melp->bits -= 1;
+            run = run << 1; // a stretch of zeros not terminating in one
+        } else {
+            //0 is found
+            run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+            melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+            melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+            melp->bits -= eval + 1;
+            run = (run << 1) + 1; // a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;                 // 7 bits per run
+        melp->runs &= ~((OPJ_UINT64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((OPJ_UINT64)run) << eval;   // store the value in runs
+        melp->num_runs++;                          // increment count
     }
-    else
-    { //0 is found
-      run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
-      melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
-      melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
-      melp->bits -= eval + 1;
-      run = (run << 1) + 1; // a stretch of zeros terminating with one
-    }
-    eval = melp->num_runs * 7;                 // 7 bits per run
-    melp->runs &= ~((OPJ_UINT64)0x3F << eval); // 6 bits are sufficient
-    melp->runs |= ((OPJ_UINT64)run) << eval;   // store the value in runs
-    melp->num_runs++;                          // increment count  
-  }
 }
 
 //************************************************************************/
 /** @brief Initiates a dec_mel_t structure for MEL decoding and reads
   *         some bytes in order to get the read address to a multiple
-  *         of 4 
+  *         of 4
   *
   *  @param [in]  melp is a pointer to dec_mel_t structure
   *  @param [in]  bbuf is a pointer to byte buffer
   *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
   *  @param [in]  scup is the length of MEL+VLC segments
   */
-static inline
+static INLINE
 void mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
 {
-  int num;
+    int num;
+    int i;
 
-  melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
-  melp->bits = 0;                  // 0 bits in tmp
-  melp->tmp = 0;                   //
-  melp->unstuff = OPJ_FALSE;       // no unstuffing
-  melp->size = scup - 1;           // size is the length of MEL+VLC-1
-  melp->k = 0;                     // 0 for state 
-  melp->num_runs = 0;              // num_runs is 0
-  melp->runs = 0;                  //
+    melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+    melp->bits = 0;                  // 0 bits in tmp
+    melp->tmp = 0;                   //
+    melp->unstuff = OPJ_FALSE;       // no unstuffing
+    melp->size = scup - 1;           // size is the length of MEL+VLC-1
+    melp->k = 0;                     // 0 for state
+    melp->num_runs = 0;              // num_runs is 0
+    melp->runs = 0;                  //
 
-  //This code is borrowed; original is for a different architecture
-  //These few lines take care of the case where data is not at a multiple
-  // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
-  num = 4 - (int)((intptr_t)(melp->data) & 0x3);
-  for (int i = 0; i < num; ++i) { // this code is similar to mel_read
-    OPJ_UINT64 d;
-    int d_bits;
-    
-    assert(melp->unstuff == OPJ_FALSE || melp->data[0] <= 0x8F);
-    d = (melp->size > 0) ? *melp->data : 0xFF; // if buffer is consumed 
-                                               // set data to 0xFF
-    if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
-                                    // see the standard
-    melp->data += melp->size-- > 0; //increment if the end is not reached
-    d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
-    melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
-    melp->bits += d_bits;  //increment tmp by number of bits
-    melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs 
-                                          //unstuffing
-  }
-  melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
-                                    // is the MSB
+    //This code is borrowed; original is for a different architecture
+    //These few lines take care of the case where data is not at a multiple
+    // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
+    num = 4 - (int)((intptr_t)(melp->data) & 0x3);
+    for (i = 0; i < num; ++i) { // this code is similar to mel_read
+        OPJ_UINT64 d;
+        int d_bits;
+
+        assert(melp->unstuff == OPJ_FALSE || melp->data[0] <= 0x8F);
+        d = (melp->size > 0) ? *melp->data : 0xFF; // if buffer is consumed
+        // set data to 0xFF
+        if (melp->size == 1) {
+            d |= 0xF;    //if this is MEL+VLC-1, set LSBs to 0xF
+        }
+        // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;  //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
+        //unstuffing
+    }
+    melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+    // is the MSB
 }
 
 //************************************************************************/
@@ -321,103 +331,104 @@ void mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
   *         MEL segment is decoded
   *
   * @param [in]  melp is a pointer to dec_mel_t structure
-  */    
-static inline
+  */
+static INLINE
 int mel_get_run(dec_mel_t *melp)
 {
-  int t;
-  if (melp->num_runs == 0)  //if no runs, decode more bit from MEL segment
-    mel_decode(melp);
+    int t;
+    if (melp->num_runs == 0) { //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+    }
 
-  t = melp->runs & 0x7F; //retrieve one run
-  melp->runs >>= 7;  // remove the retrieved run
-  melp->num_runs--;
-  return t; // return run
+    t = melp->runs & 0x7F; //retrieve one run
+    melp->runs >>= 7;  // remove the retrieved run
+    melp->num_runs--;
+    return t; // return run
 }
 
 //************************************************************************/
 /** @brief A structure for reading and unstuffing a segment that grows
   *         backward, such as VLC and MRP
-  */ 
+  */
 typedef struct rev_struct {
-  //storage
-  OPJ_UINT8* data;  //!<pointer to where to read data
-  OPJ_UINT64 tmp;	  //!<temporary buffer of read data
-  OPJ_UINT32 bits;  //!<number of bits stored in tmp
-  int size;         //!<number of bytes left
-  OPJ_BOOL unstuff; //!<true if the last byte is more than 0x8F
-                    //!<then the current byte is unstuffed if it is 0x7F
+    //storage
+    OPJ_UINT8* data;  //!<pointer to where to read data
+    OPJ_UINT64 tmp;     //!<temporary buffer of read data
+    OPJ_UINT32 bits;  //!<number of bits stored in tmp
+    int size;         //!<number of bytes left
+    OPJ_BOOL unstuff; //!<true if the last byte is more than 0x8F
+    //!<then the current byte is unstuffed if it is 0x7F
 } rev_struct_t;
 
 //************************************************************************/
 /** @brief Read and unstuff data from a backwardly-growing segment
   *
   *  This reader can read up to 8 bytes from before the VLC segment.
-  *  Care must be taken not read from unreadable memory, causing a 
+  *  Care must be taken not to read from unreadable memory, causing a
   *  segmentation fault.
-  * 
+  *
   *  Note that there is another subroutine rev_read_mrp that is slightly
   *  different.  The other one fills zeros when the buffer is exhausted.
   *  This one basically does not care if the bytes are consumed, because
   *  any extra data should not be used in the actual decoding.
   *
-  *  Unstuffing is needed to prevent sequences more than 0xFF8F from 
+  *  Unstuffing is needed to prevent sequences more than 0xFF8F from
   *  appearing in the bits stream; since we are reading backward, we keep
-  *  watch when a value larger than 0x8F appears in the bitstream. 
-  *  If the byte following this is 0x7F, we unstuff this byte (ignore the 
+  *  watch when a value larger than 0x8F appears in the bitstream.
+  *  If the byte following this is 0x7F, we unstuff this byte (ignore the
   *  MSB of that byte, which should be 0).
   *
   *  @param [in]  vlcp is a pointer to rev_struct_t structure
   */
-static inline 
+static INLINE
 void rev_read(rev_struct_t *vlcp)
 {
-  OPJ_UINT32 val;
-  OPJ_UINT32 tmp;
-  OPJ_UINT32 bits;
-  OPJ_BOOL unstuff;
+    OPJ_UINT32 val;
+    OPJ_UINT32 tmp;
+    OPJ_UINT32 bits;
+    OPJ_BOOL unstuff;
 
-  //process 4 bytes at a time
-  if (vlcp->bits > 32)  // if there are more than 32 bits in tmp, then 
-    return;             // reading 32 bits can overflow vlcp->tmp
-  val = 0;
-  //the next line (the if statement) needs to be tested first
-  if (vlcp->size > 0)  // if there are bytes left in the VLC segment
-  {
-    // We pad the data by 8 bytes at the beginning of the code stream 
-    // buffer
-    val = *(OPJ_UINT32*)vlcp->data; // then read 32 bits
-    vlcp->data -= 4;                // move data pointer back by 4
-    vlcp->size -= 4;                // reduce available byte by 4
-  }
+    //process 4 bytes at a time
+    if (vlcp->bits > 32) { // if there are more than 32 bits in tmp, then
+        return;    // reading 32 bits can overflow vlcp->tmp
+    }
+    val = 0;
+    //the next line (the if statement) needs to be tested first
+    if (vlcp->size > 0) { // if there are bytes left in the VLC segment
+        // We pad the data by 8 bytes at the beginning of the code stream
+        // buffer
+        val = *(OPJ_UINT32*)vlcp->data; // then read 32 bits
+        vlcp->data -= 4;                // move data pointer back by 4
+        vlcp->size -= 4;                // reduce available byte by 4
+    }
 
-  //accumulate in tmp, number of bits in tmp are stored in bits
-  tmp = val >> 24;  //start with the MSB byte
+    //accumulate in tmp, number of bits in tmp are stored in bits
+    tmp = val >> 24;  //start with the MSB byte
 
-  // test unstuff (previous byte is >0x8F), and this byte is 0x7F
-  bits = 8u - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = (val >> 24) > 0x8F; //this is for the next byte
+    // test unstuff (previous byte is >0x8F), and this byte is 0x7F
+    bits = 8u - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = (val >> 24) > 0x8F; //this is for the next byte
 
-  tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
-  bits += 8u - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = ((val >> 16) & 0xFF) > 0x8F;
+    tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
+    bits += 8u - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = ((val >> 16) & 0xFF) > 0x8F;
 
-  tmp |= ((val >> 8) & 0xFF) << bits;
-  bits += 8u - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = ((val >> 8) & 0xFF) > 0x8F;
+    tmp |= ((val >> 8) & 0xFF) << bits;
+    bits += 8u - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = ((val >> 8) & 0xFF) > 0x8F;
 
-  tmp |= (val & 0xFF) << bits;
-  bits += 8u - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = (val & 0xFF) > 0x8F;
+    tmp |= (val & 0xFF) << bits;
+    bits += 8u - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = (val & 0xFF) > 0x8F;
 
-  // now move the read and unstuffed bits into vlcp->tmp
-  vlcp->tmp |= (OPJ_UINT64)tmp << vlcp->bits;
-  vlcp->bits += bits;
-  vlcp->unstuff = unstuff; // this for the next read
+    // now move the read and unstuffed bits into vlcp->tmp
+    vlcp->tmp |= (OPJ_UINT64)tmp << vlcp->bits;
+    vlcp->bits += bits;
+    vlcp->unstuff = unstuff; // this for the next read
 }
 
 //************************************************************************/
-/** @brief Initiates the rev_struct_t structure and reads a few bytes to 
+/** @brief Initiates the rev_struct_t structure and reads a few bytes to
   *         move the read address to multiple of 4
   *
   *  There is another similar rev_init_mrp subroutine.  The difference is
@@ -430,62 +441,61 @@ void rev_read(rev_struct_t *vlcp)
   *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
   *  @param [in]  scup is the length of MEL+VLC segments
   */
-static inline 
+static INLINE
 void rev_init(rev_struct_t *vlcp, OPJ_UINT8* data, int lcup, int scup)
 {
-  OPJ_UINT32 d;
-  int num;
-  int tnum;
+    OPJ_UINT32 d;
+    int num, tnum, i;
 
-  //first byte has only the upper 4 bits
-  vlcp->data = data + lcup - 2;
+    //first byte has only the upper 4 bits
+    vlcp->data = data + lcup - 2;
 
-  //size can not be larger than this, in fact it should be smaller
-  vlcp->size = scup - 2;
+    //size can not be larger than this, in fact it should be smaller
+    vlcp->size = scup - 2;
 
-  d = *vlcp->data--;            // read one byte (this is a half byte)
-  vlcp->tmp = d >> 4;           // both initialize and set
-  vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
-  vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
+    d = *vlcp->data--;            // read one byte (this is a half byte)
+    vlcp->tmp = d >> 4;           // both initialize and set
+    vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
+    vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
 
-  //This code is designed for an architecture that read address should
-  // align to the read size (address multiple of 4 if read size is 4)
-  //These few lines take care of the case where data is not at a multiple
-  // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream
-  num = 1 + (int)((intptr_t)(vlcp->data) & 0x3);
-  tnum = num < vlcp->size ? num : vlcp->size;
-  for (int i = 0; i < tnum; ++i) {
-    OPJ_UINT64 d;
-    OPJ_UINT32 d_bits;
-    d = *vlcp->data--;  // read one byte and move read pointer
-    //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
-    d_bits = 8u - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
-    vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
-    vlcp->bits += d_bits;
-    vlcp->unstuff = d > 0x8F; // for next byte
-  }
-  vlcp->size -= tnum;
-  vlcp->data -= 3; // make ready to read 32 bits (address multiple of 4)
-  rev_read(vlcp);  // read another 32 buts
+    //This code is designed for an architecture that read address should
+    // align to the read size (address multiple of 4 if read size is 4)
+    //These few lines take care of the case where data is not at a multiple
+    // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream
+    num = 1 + (int)((intptr_t)(vlcp->data) & 0x3);
+    tnum = num < vlcp->size ? num : vlcp->size;
+    for (i = 0; i < tnum; ++i) {
+        OPJ_UINT64 d;
+        OPJ_UINT32 d_bits;
+        d = *vlcp->data--;  // read one byte and move read pointer
+        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
+        d_bits = 8u - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+    }
+    vlcp->size -= tnum;
+    vlcp->data -= 3; // make ready to read 32 bits (address multiple of 4)
+    rev_read(vlcp);  // read another 32 bits
 }
 
 //************************************************************************/
-/** @brief Retrieves 32 bits from the head of a rev_struct structure 
+/** @brief Retrieves 32 bits from the head of a rev_struct structure
   *
   *  By the end of this call, vlcp->tmp must have no less than 33 bits
   *
   *  @param [in]  vlcp is a pointer to rev_struct structure
   */
-static inline 
+static INLINE
 OPJ_UINT32 rev_fetch(rev_struct_t *vlcp)
 {
-  if (vlcp->bits < 32)  // if there are less then 32 bits, read more
-  {
-    rev_read(vlcp);     // read 32 bits, but unstuffing might reduce this
-    if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
-      rev_read(vlcp);   // read another 32
-  }
-  return (OPJ_UINT32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
+    if (vlcp->bits < 32) { // if there are less than 32 bits, read more
+        rev_read(vlcp);     // read 32 bits, but unstuffing might reduce this
+        if (vlcp->bits < 32) { // if there is still space in vlcp->tmp for 32 bits
+            rev_read(vlcp);    // read another 32
+        }
+    }
+    return (OPJ_UINT32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
 }
 
 //************************************************************************/
@@ -494,13 +504,13 @@ OPJ_UINT32 rev_fetch(rev_struct_t *vlcp)
   *  @param [in]  vlcp is a pointer to rev_struct structure
   *  @param [in]  num_bits is the number of bits to be removed
   */
-static inline 
+static INLINE
 OPJ_UINT32 rev_advance(rev_struct_t *vlcp, OPJ_UINT32 num_bits)
 {
-  assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
-  vlcp->tmp >>= num_bits;         // remove bits
-  vlcp->bits -= num_bits;         // decrement the number of bits
-  return (OPJ_UINT32)vlcp->tmp;
+    assert(num_bits <= vlcp->bits); // vlcp->tmp must have at least num_bits
+    vlcp->tmp >>= num_bits;         // remove bits
+    vlcp->bits -= num_bits;         // decrement the number of bits
+    return (OPJ_UINT32)vlcp->tmp;
 }
 
 //************************************************************************/
@@ -514,50 +524,51 @@ OPJ_UINT32 rev_advance(rev_struct_t *vlcp, OPJ_UINT32 num_bits)
   *
   *  @param [in]  mrp is a pointer to rev_struct structure
   */
-static inline 
+static INLINE
 void rev_read_mrp(rev_struct_t *mrp)
 {
-  OPJ_UINT32 val;
-  OPJ_UINT32 tmp; 
-  OPJ_UINT32 bits;
-  OPJ_BOOL unstuff;
+    OPJ_UINT32 val;
+    OPJ_UINT32 tmp;
+    OPJ_UINT32 bits;
+    OPJ_BOOL unstuff;
 
-  //process 4 bytes at a time
-  if (mrp->bits > 32)
-    return;
-  val = 0;
-  //the next line (the if statement) needs to be tested first
-  //notice that second line can be simplified to mrp->data -= 4
-  // if (mrp->size > 0)
-  {
-    val = *(OPJ_UINT32*)mrp->data;      // read 32 bits
-    mrp->data -= mrp->size > 0 ? 4 : 0; // move back read pointer only if 
-                                        // there is data
-  }
+    //process 4 bytes at a time
+    if (mrp->bits > 32) {
+        return;
+    }
+    val = 0;
+    //the next line (the if statement) needs to be tested first
+    //notice that second line can be simplified to mrp->data -= 4
+    // if (mrp->size > 0)
+    {
+        val = *(OPJ_UINT32*)mrp->data;      // read 32 bits
+        mrp->data -= mrp->size > 0 ? 4 : 0; // move back read pointer only if
+        // there is data
+    }
 
-  //accumulate in tmp, and keep count in bits
-  tmp = (mrp->size-- > 0) ? (val >> 24) : 0; // fill zeros if all 
-                                                        
-  //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
-  bits = 8u - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = (val >> 24) > 0x8F;
+    //accumulate in tmp, and keep count in bits
+    tmp = (mrp->size-- > 0) ? (val >> 24) : 0; // fill zeros if data is exhausted
 
-  //process the next byte
-  tmp |= (mrp->size-- > 0) ? (((val >> 16) & 0xFF) << bits) : 0;
-  bits += 8u - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = ((val >> 16) & 0xFF) > 0x8F;
+    //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+    bits = 8u - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = (val >> 24) > 0x8F;
 
-  tmp |= (mrp->size-- > 0) ? (((val >> 8) & 0xFF) << bits) : 0;
-  bits += 8u - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = ((val >> 8) & 0xFF) > 0x8F;
+    //process the next byte
+    tmp |= (mrp->size-- > 0) ? (((val >> 16) & 0xFF) << bits) : 0;
+    bits += 8u - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = ((val >> 16) & 0xFF) > 0x8F;
 
-  tmp |= (mrp->size-- > 0) ? ((val & 0xFF) << bits) : 0;
-  bits += 8u - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1u : 0u);
-  unstuff = (val & 0xFF) > 0x8F;
+    tmp |= (mrp->size-- > 0) ? (((val >> 8) & 0xFF) << bits) : 0;
+    bits += 8u - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = ((val >> 8) & 0xFF) > 0x8F;
 
-  mrp->tmp |= (OPJ_UINT64)tmp << mrp->bits; // move data to mrp pointer
-  mrp->bits += bits;
-  mrp->unstuff = unstuff;                   // next byte
+    tmp |= (mrp->size-- > 0) ? ((val & 0xFF) << bits) : 0;
+    bits += 8u - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1u : 0u);
+    unstuff = (val & 0xFF) > 0x8F;
+
+    mrp->tmp |= (OPJ_UINT64)tmp << mrp->bits; // move data to mrp pointer
+    mrp->bits += bits;
+    mrp->unstuff = unstuff;                   // next byte
 }
 
 //************************************************************************/
@@ -567,7 +578,7 @@ void rev_read_mrp(rev_struct_t *mrp)
   *         an architecture that read size must be compatible with the
   *         alignment of the read address
   *
-  *  There is another simiar subroutine rev_init.  This subroutine does 
+  *  There is another similar subroutine rev_init.  This subroutine does
   *  NOT skip the first 12 bits, and starts with unstuff set to true.
   *
   *  @param [in]  mrp is a pointer to rev_struct structure
@@ -575,55 +586,55 @@ void rev_read_mrp(rev_struct_t *mrp)
   *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
   *  @param [in]  len2 is the length of SPP+MRP segments
   */
-static inline 
+static INLINE
 void rev_init_mrp(rev_struct_t *mrp, OPJ_UINT8* data, int lcup, int len2)
 {
-  int num;
+    int num, i;
 
-  mrp->data = data + lcup + len2 - 1;
-  mrp->size = len2;
-  mrp->unstuff = OPJ_TRUE;
-  mrp->bits = 0;
-  mrp->tmp = 0;
+    mrp->data = data + lcup + len2 - 1;
+    mrp->size = len2;
+    mrp->unstuff = OPJ_TRUE;
+    mrp->bits = 0;
+    mrp->tmp = 0;
 
-  //This code is designed for an architecture that read address should
-  // align to the read size (address multiple of 4 if read size is 4)
-  //These few lines take care of the case where data is not at a multiple
-  // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
-  num = 1 + (int)((intptr_t)(mrp->data) & 0x3);
-  for (int i = 0; i < num; ++i) {
-    OPJ_UINT64 d;
-    OPJ_UINT32 d_bits;
+    //This code is designed for an architecture that read address should
+    // align to the read size (address multiple of 4 if read size is 4)
+    //These few lines take care of the case where data is not at a multiple
+    // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
+    num = 1 + (int)((intptr_t)(mrp->data) & 0x3);
+    for (i = 0; i < num; ++i) {
+        OPJ_UINT64 d;
+        OPJ_UINT32 d_bits;
 
-    //read a byte, 0 if no more data
-    d = (mrp->size-- > 0) ? *mrp->data-- : 0; 
-    //check if unstuffing is needed
-    d_bits = 8u - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
-    mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
-    mrp->bits += d_bits;
-    mrp->unstuff = d > 0x8F; // for next byte
-  }
-  mrp->data -= 3; //make ready to read a 32 bits
-  rev_read_mrp(mrp);
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        d_bits = 8u - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+    }
+    mrp->data -= 3; //make ready to read a 32 bits
+    rev_read_mrp(mrp);
 }
 
 //************************************************************************/
-/** @brief Retrieves 32 bits from the head of a rev_struct structure 
+/** @brief Retrieves 32 bits from the head of a rev_struct structure
   *
   *  By the end of this call, mrp->tmp must have no less than 33 bits
   *
   *  @param [in]  mrp is a pointer to rev_struct structure
   */
-static inline 
+static INLINE
 OPJ_UINT32 rev_fetch_mrp(rev_struct_t *mrp)
 {
-  if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
-  {
-    rev_read_mrp(mrp);    // read 30-32 bits from mrp
-    if (mrp->bits < 32)   // if there is a space of 32 bits
-      rev_read_mrp(mrp);  // read more
-  }
-  return (OPJ_UINT32)mrp->tmp;  // return the head of mrp->tmp
+    if (mrp->bits < 32) { // if there are less than 32 bits in mrp->tmp
+        rev_read_mrp(mrp);    // read 30-32 bits from mrp
+        if (mrp->bits < 32) { // if there is a space of 32 bits
+            rev_read_mrp(mrp);    // read more
+        }
+    }
+    return (OPJ_UINT32)mrp->tmp;  // return the head of mrp->tmp
 }
 
 //************************************************************************/
@@ -632,13 +643,13 @@ OPJ_UINT32 rev_fetch_mrp(rev_struct_t *mrp)
   *  @param [in]  mrp is a pointer to rev_struct structure
   *  @param [in]  num_bits is the number of bits to be removed
   */
-static inline 
+static INLINE
 OPJ_UINT32 rev_advance_mrp(rev_struct_t *mrp, OPJ_UINT32 num_bits)
 {
-  assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
-  mrp->tmp >>= num_bits;         // discard the lowest num_bits bits
-  mrp->bits -= num_bits;
-  return (OPJ_UINT32)mrp->tmp;   // return data after consumption
+    assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+    mrp->tmp >>= num_bits;         // discard the lowest num_bits bits
+    mrp->bits -= num_bits;
+    return (OPJ_UINT32)mrp->tmp;   // return data after consumption
 }
 
 //************************************************************************/
@@ -652,218 +663,203 @@ OPJ_UINT32 rev_advance_mrp(rev_struct_t *mrp, OPJ_UINT32 num_bits)
   *  @param [out] u is the u value (or u_q) + 1.  Note: we produce u + 1;
   *               this value is a partial calculation of u + kappa.
   */
-static inline 
+static INLINE
 OPJ_UINT32 decode_init_uvlc(OPJ_UINT32 vlc, OPJ_UINT32 mode, OPJ_UINT32 *u)
 {
-  //table stores possible decoding three bits from vlc
-  // there are 8 entries for xx1, x10, 100, 000, where x means do not care
-  // table value is made up of
-  // 2 bits in the LSB for prefix length 
-  // 3 bits for suffix length
-  // 3 bits in the MSB for prefix value (u_pfx in Table 3 of ITU T.814)
-  static const OPJ_UINT8 dec[8] = { // the index is the prefix codeword
-    3 | (5 << 2) | (5 << 5),        //000 == 000, prefix codeword "000"
-    1 | (0 << 2) | (1 << 5),        //001 == xx1, prefix codeword "1"
-    2 | (0 << 2) | (2 << 5),        //010 == x10, prefix codeword "01"
-    1 | (0 << 2) | (1 << 5),        //011 == xx1, prefix codeword "1"
-    3 | (1 << 2) | (3 << 5),        //100 == 100, prefix codeword "001"
-    1 | (0 << 2) | (1 << 5),        //101 == xx1, prefix codeword "1"
-    2 | (0 << 2) | (2 << 5),        //110 == x10, prefix codeword "01"
-    1 | (0 << 2) | (1 << 5)         //111 == xx1, prefix codeword "1"
-  };
+    //table stores possible decoding three bits from vlc
+    // there are 8 entries for xx1, x10, 100, 000, where x means do not care
+    // table value is made up of
+    // 2 bits in the LSB for prefix length
+    // 3 bits for suffix length
+    // 3 bits in the MSB for prefix value (u_pfx in Table 3 of ITU T.814)
+    static const OPJ_UINT8 dec[8] = { // the index is the prefix codeword
+        3 | (5 << 2) | (5 << 5),        //000 == 000, prefix codeword "000"
+        1 | (0 << 2) | (1 << 5),        //001 == xx1, prefix codeword "1"
+        2 | (0 << 2) | (2 << 5),        //010 == x10, prefix codeword "01"
+        1 | (0 << 2) | (1 << 5),        //011 == xx1, prefix codeword "1"
+        3 | (1 << 2) | (3 << 5),        //100 == 100, prefix codeword "001"
+        1 | (0 << 2) | (1 << 5),        //101 == xx1, prefix codeword "1"
+        2 | (0 << 2) | (2 << 5),        //110 == x10, prefix codeword "01"
+        1 | (0 << 2) | (1 << 5)         //111 == xx1, prefix codeword "1"
+    };
 
-  OPJ_UINT32 consumed_bits = 0;
-  if (mode == 0)  // both u_off are 0
-  {
-    u[0] = u[1] = 1; //Kappa is 1 for initial line
-  }
-  else if (mode <= 2) // u_off are either 01 or 10
-  {
-    OPJ_UINT32 d;
-    OPJ_UINT32 suffix_len;
+    OPJ_UINT32 consumed_bits = 0;
+    if (mode == 0) { // both u_off are 0
+        u[0] = u[1] = 1; //Kappa is 1 for initial line
+    } else if (mode <= 2) { // u_off are either 01 or 10
+        OPJ_UINT32 d;
+        OPJ_UINT32 suffix_len;
 
-    d = dec[vlc & 0x7];   //look at the least significant 3 bits
-    vlc >>= d & 0x3;                 //prefix length
-    consumed_bits += d & 0x3; 
+        d = dec[vlc & 0x7];   //look at the least significant 3 bits
+        vlc >>= d & 0x3;                 //prefix length
+        consumed_bits += d & 0x3;
 
-    suffix_len = ((d >> 2) & 0x7); 
-    consumed_bits += suffix_len;
+        suffix_len = ((d >> 2) & 0x7);
+        consumed_bits += suffix_len;
 
-    d = (d >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[0] = (mode == 1) ? d + 1 : 1; // kappa is 1 for initial line
-    u[1] = (mode == 1) ? 1 : d + 1; // kappa is 1 for initial line
-  }
-  else if (mode == 3) // both u_off are 1, and MEL event is 0
-  {
-    OPJ_UINT32 d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-    vlc >>= d1 & 0x3;                // Consume bits
-    consumed_bits += d1 & 0x3;
+        d = (d >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[0] = (mode == 1) ? d + 1 : 1; // kappa is 1 for initial line
+        u[1] = (mode == 1) ? 1 : d + 1; // kappa is 1 for initial line
+    } else if (mode == 3) { // both u_off are 1, and MEL event is 0
+        OPJ_UINT32 d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+        vlc >>= d1 & 0x3;                // Consume bits
+        consumed_bits += d1 & 0x3;
 
-    if ((d1 & 0x3) > 2)
-    {
-      OPJ_UINT32 suffix_len;
+        if ((d1 & 0x3) > 2) {
+            OPJ_UINT32 suffix_len;
 
-      //u_{q_2} prefix
-      u[1] = (vlc & 1) + 1 + 1; //Kappa is 1 for initial line
-      ++consumed_bits;
-      vlc >>= 1;
+            //u_{q_2} prefix
+            u[1] = (vlc & 1) + 1 + 1; //Kappa is 1 for initial line
+            ++consumed_bits;
+            vlc >>= 1;
 
-      suffix_len = ((d1 >> 2) & 0x7);
-      consumed_bits += suffix_len;
-      d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-      u[0] = d1 + 1; //Kappa is 1 for initial line
+            suffix_len = ((d1 >> 2) & 0x7);
+            consumed_bits += suffix_len;
+            d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+            u[0] = d1 + 1; //Kappa is 1 for initial line
+        } else {
+            OPJ_UINT32 d2;
+            OPJ_UINT32 suffix_len;
+
+            d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+            vlc >>= d2 & 0x3;                // Consume bits
+            consumed_bits += d2 & 0x3;
+
+            suffix_len = ((d1 >> 2) & 0x7);
+            consumed_bits += suffix_len;
+
+            d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+            u[0] = d1 + 1; //Kappa is 1 for initial line
+            vlc >>= suffix_len;
+
+            suffix_len = ((d2 >> 2) & 0x7);
+            consumed_bits += suffix_len;
+
+            d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+            u[1] = d2 + 1; //Kappa is 1 for initial line
+        }
+    } else if (mode == 4) { // both u_off are 1, and MEL event is 1
+        OPJ_UINT32 d1;
+        OPJ_UINT32 d2;
+        OPJ_UINT32 suffix_len;
+
+        d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+        vlc >>= d1 & 0x3;                // Consume bits
+        consumed_bits += d1 & 0x3;
+
+        d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+        vlc >>= d2 & 0x3;                // Consume bits
+        consumed_bits += d2 & 0x3;
+
+        suffix_len = ((d1 >> 2) & 0x7);
+        consumed_bits += suffix_len;
+
+        d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[0] = d1 + 3; // add 2+kappa
+        vlc >>= suffix_len;
+
+        suffix_len = ((d2 >> 2) & 0x7);
+        consumed_bits += suffix_len;
+
+        d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[1] = d2 + 3; // add 2+kappa
     }
-    else
-    {
-      OPJ_UINT32 d2;
-      OPJ_UINT32 suffix_len;
-
-      d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-      vlc >>= d2 & 0x3;     // Consume bits
-      consumed_bits += d2 & 0x3;
-
-      suffix_len = ((d1 >> 2) & 0x7);
-      consumed_bits += suffix_len;
-
-      d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-      u[0] = d1 + 1; //Kappa is 1 for initial line
-      vlc >>= suffix_len;
-
-      suffix_len = ((d2 >> 2) & 0x7);
-      consumed_bits += suffix_len;
-
-      d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-      u[1] = d2 + 1; //Kappa is 1 for initial line
-    }
-  }
-  else if (mode == 4) // both u_off are 1, and MEL event is 1
-  {
-    OPJ_UINT32 d1;
-    OPJ_UINT32 d2;
-    OPJ_UINT32 suffix_len;
-
-    d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-    vlc >>= d1 & 0x3;     // Consume bits
-    consumed_bits += d1 & 0x3;
-
-    d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-    vlc >>= d2 & 0x3;     // Consume bits
-    consumed_bits += d2 & 0x3;
-
-    suffix_len = ((d1 >> 2) & 0x7);
-    consumed_bits += suffix_len;
-
-    d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[0] = d1 + 3; // add 2+kappa
-    vlc >>= suffix_len;
-
-    suffix_len = ((d2 >> 2) & 0x7);
-    consumed_bits += suffix_len;
-
-    d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[1] = d2 + 3; // add 2+kappa
-  }
-  return consumed_bits;
+    return consumed_bits;
 }
 
 //************************************************************************/
 /** @brief Decode non-initial UVLC to get the u value (or u_q)
   *
   *  @param [in]  vlc is the head of the VLC bitstream
-  *  @param [in]  mode is 0, 1, 2, or 3. The 1st bit is u_off of 1st quad 
+  *  @param [in]  mode is 0, 1, 2, or 3. The 1st bit is u_off of 1st quad
   *               and 2nd for 2nd quad of a quad pair
   *  @param [out] u is the u value (or u_q) + 1.  Note: we produce u + 1;
   *               this value is a partial calculation of u + kappa.
   */
-static inline 
+static INLINE
 OPJ_UINT32 decode_noninit_uvlc(OPJ_UINT32 vlc, OPJ_UINT32 mode, OPJ_UINT32 *u)
 {
-  //table stores possible decoding three bits from vlc
-  // there are 8 entries for xx1, x10, 100, 000, where x means do not care
-  // table value is made up of
-  // 2 bits in the LSB for prefix length 
-  // 3 bits for suffix length
-  // 3 bits in the MSB for prefix value (u_pfx in Table 3 of ITU T.814)
-  static const OPJ_UINT8 dec[8] = {
-    3 | (5 << 2) | (5 << 5), //000 == 000, prefix codeword "000"
-    1 | (0 << 2) | (1 << 5), //001 == xx1, prefix codeword "1"
-    2 | (0 << 2) | (2 << 5), //010 == x10, prefix codeword "01"
-    1 | (0 << 2) | (1 << 5), //011 == xx1, prefix codeword "1"
-    3 | (1 << 2) | (3 << 5), //100 == 100, prefix codeword "001"
-    1 | (0 << 2) | (1 << 5), //101 == xx1, prefix codeword "1"
-    2 | (0 << 2) | (2 << 5), //110 == x10, prefix codeword "01"
-    1 | (0 << 2) | (1 << 5)  //111 == xx1, prefix codeword "1"
-  };
+    //table stores possible decoding three bits from vlc
+    // there are 8 entries for xx1, x10, 100, 000, where x means do not care
+    // table value is made up of
+    // 2 bits in the LSB for prefix length
+    // 3 bits for suffix length
+    // 3 bits in the MSB for prefix value (u_pfx in Table 3 of ITU T.814)
+    static const OPJ_UINT8 dec[8] = {
+        3 | (5 << 2) | (5 << 5), //000 == 000, prefix codeword "000"
+        1 | (0 << 2) | (1 << 5), //001 == xx1, prefix codeword "1"
+        2 | (0 << 2) | (2 << 5), //010 == x10, prefix codeword "01"
+        1 | (0 << 2) | (1 << 5), //011 == xx1, prefix codeword "1"
+        3 | (1 << 2) | (3 << 5), //100 == 100, prefix codeword "001"
+        1 | (0 << 2) | (1 << 5), //101 == xx1, prefix codeword "1"
+        2 | (0 << 2) | (2 << 5), //110 == x10, prefix codeword "01"
+        1 | (0 << 2) | (1 << 5)  //111 == xx1, prefix codeword "1"
+    };
 
-  OPJ_UINT32 consumed_bits = 0;
-  if (mode == 0)
-  {
-    u[0] = u[1] = 1; //for kappa
-  }
-  else if (mode <= 2) //u_off are either 01 or 10
-  {
-    OPJ_UINT32 d;
-    OPJ_UINT32 suffix_len;
+    OPJ_UINT32 consumed_bits = 0;
+    if (mode == 0) {
+        u[0] = u[1] = 1; //for kappa
+    } else if (mode <= 2) { //u_off are either 01 or 10
+        OPJ_UINT32 d;
+        OPJ_UINT32 suffix_len;
 
-    d = dec[vlc & 0x7];  //look at the least significant 3 bits
-    vlc >>= d & 0x3;     //prefix length
-    consumed_bits += d & 0x3;
+        d = dec[vlc & 0x7];  //look at the least significant 3 bits
+        vlc >>= d & 0x3;                //prefix length
+        consumed_bits += d & 0x3;
 
-    suffix_len = ((d >> 2) & 0x7);
-    consumed_bits += suffix_len;
+        suffix_len = ((d >> 2) & 0x7);
+        consumed_bits += suffix_len;
 
-    d = (d >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[0] = (mode == 1) ? d + 1 : 1; //for kappa
-    u[1] = (mode == 1) ? 1 : d + 1; //for kappa
-  }
-  else if (mode == 3) // both u_off are 1
-  {
-    OPJ_UINT32 d1;
-    OPJ_UINT32 d2;
-    OPJ_UINT32 suffix_len;
+        d = (d >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[0] = (mode == 1) ? d + 1 : 1; //for kappa
+        u[1] = (mode == 1) ? 1 : d + 1; //for kappa
+    } else if (mode == 3) { // both u_off are 1
+        OPJ_UINT32 d1;
+        OPJ_UINT32 d2;
+        OPJ_UINT32 suffix_len;
 
-    d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-    vlc >>= d1 & 0x3;     // Consume bits
-    consumed_bits += d1 & 0x3;
+        d1 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+        vlc >>= d1 & 0x3;                // Consume bits
+        consumed_bits += d1 & 0x3;
 
-    d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
-    vlc >>= d2 & 0x3;     // Consume bits
-    consumed_bits += d2 & 0x3;
+        d2 = dec[vlc & 0x7];  // LSBs of VLC are prefix codeword
+        vlc >>= d2 & 0x3;                // Consume bits
+        consumed_bits += d2 & 0x3;
 
-    suffix_len = ((d1 >> 2) & 0x7);
-    consumed_bits += suffix_len;
+        suffix_len = ((d1 >> 2) & 0x7);
+        consumed_bits += suffix_len;
 
-    d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[0] = d1 + 1;  //1 for kappa
-    vlc >>= suffix_len;
+        d1 = (d1 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[0] = d1 + 1;  //1 for kappa
+        vlc >>= suffix_len;
 
-    suffix_len = ((d2 >> 2) & 0x7);
-    consumed_bits += suffix_len;
+        suffix_len = ((d2 >> 2) & 0x7);
+        consumed_bits += suffix_len;
 
-    d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
-    u[1] = d2 + 1;  //1 for kappa
-  }
-  return consumed_bits;
+        d2 = (d2 >> 5) + (vlc & ((1U << suffix_len) - 1)); // u value
+        u[1] = d2 + 1;  //1 for kappa
+    }
+    return consumed_bits;
 }
 
 //************************************************************************/
-/** @brief State structure for reading and unstuffing of forward-growing 
+/** @brief State structure for reading and unstuffing of forward-growing
   *         bitstreams; these are: MagSgn and SPP bitstreams
   */
 typedef struct frwd_struct {
-  const OPJ_UINT8* data; //!<pointer to bitstream
-  OPJ_UINT64 tmp;        //!<temporary buffer of read data
-  OPJ_UINT32 bits;       //!<number of bits stored in tmp
-  OPJ_BOOL unstuff;      //!<true if a bit needs to be unstuffed from next byte
-  int size;              //!<size of data
-  OPJ_UINT32 X;          //!<0 or 0xFF, X's are inserted at end of bitstream
+    const OPJ_UINT8* data; //!<pointer to bitstream
+    OPJ_UINT64 tmp;        //!<temporary buffer of read data
+    OPJ_UINT32 bits;       //!<number of bits stored in tmp
+    OPJ_BOOL unstuff;      //!<true if a bit needs to be unstuffed from next byte
+    int size;              //!<size of data
+    OPJ_UINT32 X;          //!<0 or 0xFF, X's are inserted at end of bitstream
 } frwd_struct_t;
 
 //************************************************************************/
 /** @brief Read and unstuffs 32 bits from forward-growing bitstream
-  *  
-  *  A subroutine to read from both the MagSgn or SPP bitstreams; 
-  *  in particular, when MagSgn bitstream is consumed, 0xFF's are fed, 
+  *
+  *  A subroutine to read from both the MagSgn or SPP bitstreams;
+  *  in particular, when MagSgn bitstream is consumed, 0xFF's are fed,
   *  while when SPP is exhausted 0's are fed in.
   *  X controls this value.
   *
@@ -875,81 +871,80 @@ typedef struct frwd_struct {
   *
   *  @param  [in]  msp is a pointer to frwd_struct_t structure
   *
-  */ 
-static inline
+  */
+static INLINE
 void frwd_read(frwd_struct_t *msp)
 {
-  OPJ_UINT32 val;
-  OPJ_UINT32 bits;
-  OPJ_UINT32 t;
-  OPJ_BOOL unstuff;
+    OPJ_UINT32 val;
+    OPJ_UINT32 bits;
+    OPJ_UINT32 t;
+    OPJ_BOOL unstuff;
 
-  assert(msp->bits <= 32); // assert that there is a space for 32 bits
+    assert(msp->bits <= 32); // assert that there is a space for 32 bits
 
-  val = *(OPJ_UINT32*)msp->data;      // read 32 bits
-  msp->data += msp->size > 0 ? 4 : 0; // move pointer if data is not 
-                                      // exhausted
+    val = *(OPJ_UINT32*)msp->data;      // read 32 bits
+    msp->data += msp->size > 0 ? 4 : 0; // move pointer if data is not
+    // exhausted
 
-  // we accumulate in t and keep a count of the number of bits in bits
-  bits = 8u - (msp->unstuff ? 1u:0u);     // if previous byte was 0xFF
-  // get next byte, if bitstream is exhausted, replace it with X
-  t = msp->size-- > 0 ? (val & 0xFF) : msp->X;
-  unstuff = ((val & 0xFF) == 0xFF);  // Do we need unstuffing next?
+    // we accumulate in t and keep a count of the number of bits in bits
+    bits = 8u - (msp->unstuff ? 1u : 0u);   // if previous byte was 0xFF
+    // get next byte, if bitstream is exhausted, replace it with X
+    t = msp->size-- > 0 ? (val & 0xFF) : msp->X;
+    unstuff = ((val & 0xFF) == 0xFF);  // Do we need unstuffing next?
 
-  t |= (msp->size-- > 0 ? ((val >> 8) & 0xFF) : msp->X) << bits;
-  bits += 8u - (unstuff ? 1u:0u);
-  unstuff = (((val >> 8) & 0xFF) == 0xFF);
+    t |= (msp->size-- > 0 ? ((val >> 8) & 0xFF) : msp->X) << bits;
+    bits += 8u - (unstuff ? 1u : 0u);
+    unstuff = (((val >> 8) & 0xFF) == 0xFF);
 
-  t |= (msp->size-- > 0 ? ((val >> 16) & 0xFF) : msp->X) << bits;
-  bits += 8u - (unstuff ? 1u:0u);
-  unstuff = (((val >> 16) & 0xFF) == 0xFF);
+    t |= (msp->size-- > 0 ? ((val >> 16) & 0xFF) : msp->X) << bits;
+    bits += 8u - (unstuff ? 1u : 0u);
+    unstuff = (((val >> 16) & 0xFF) == 0xFF);
 
-  t |= (msp->size-- > 0 ? ((val >> 24) & 0xFF) : msp->X) << bits;
-  bits += 8u - (unstuff ? 1u:0u);
-  msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte
+    t |= (msp->size-- > 0 ? ((val >> 24) & 0xFF) : msp->X) << bits;
+    bits += 8u - (unstuff ? 1u : 0u);
+    msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte
 
-  msp->tmp |= ((OPJ_UINT64)t) << msp->bits;  // move data to msp->tmp
-  msp->bits += bits;
+    msp->tmp |= ((OPJ_UINT64)t) << msp->bits;  // move data to msp->tmp
+    msp->bits += bits;
 }
 
 //************************************************************************/
 /** @brief Initialize frwd_struct_t struct and reads some bytes
-  *  
+  *
   *  @param [in]  msp is a pointer to frwd_struct_t
   *  @param [in]  data is a pointer to the start of data
   *  @param [in]  size is the number of byte in the bitstream
   *  @param [in]  X is the value fed in when the bitstream is exhausted.
   *               See frwd_read.
   */
-static inline
-void frwd_init(frwd_struct_t *msp, const OPJ_UINT8* data, int size, 
+static INLINE
+void frwd_init(frwd_struct_t *msp, const OPJ_UINT8* data, int size,
                OPJ_UINT32 X)
 {
-  int num;
+    int num, i;
 
-  msp->data = data;
-  msp->tmp = 0;
-  msp->bits = 0;
-  msp->unstuff = OPJ_FALSE;
-  msp->size = size;
-  msp->X = X;
-  assert(msp->X == 0 || msp->X == 0xFF);
+    msp->data = data;
+    msp->tmp = 0;
+    msp->bits = 0;
+    msp->unstuff = OPJ_FALSE;
+    msp->size = size;
+    msp->X = X;
+    assert(msp->X == 0 || msp->X == 0xFF);
 
-  //This code is designed for an architecture that read address should
-  // align to the read size (address multiple of 4 if read size is 4)
-  //These few lines take care of the case where data is not at a multiple
-  // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the bitstream
-  num = 4 - (int)((intptr_t)(msp->data) & 0x3);
-  for (int i = 0; i < num; ++i)
-  {
-    OPJ_UINT64 d;
-    //read a byte if the buffer is not exhausted, otherwise set it to X
-    d = msp->size-- > 0 ? *msp->data++ : msp->X;
-    msp->tmp |= (d << msp->bits);           // store data in msp->tmp
-    msp->bits += 8u - (msp->unstuff?1u:0u); // number of bits added to msp->tmp
-    msp->unstuff = ((d & 0xFF) == 0xFF);    // unstuffing for next byte
-  }
-  frwd_read(msp); // read 32 bits more
+    //This code is designed for an architecture that read address should
+    // align to the read size (address multiple of 4 if read size is 4)
+    //These few lines take care of the case where data is not at a multiple
+    // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the bitstream
+    num = 4 - (int)((intptr_t)(msp->data) & 0x3);
+    for (i = 0; i < num; ++i) {
+        OPJ_UINT64 d;
+        //read a byte if the buffer is not exhausted, otherwise set it to X
+        d = msp->size-- > 0 ? *msp->data++ : msp->X;
+        msp->tmp |= (d << msp->bits);      // store data in msp->tmp
+        msp->bits += 8u - (msp->unstuff ? 1u : 0u); // number of bits added to msp->tmp
+        msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte
+    }
+    frwd_read(msp); // read 32 bits more
 }
 
 //************************************************************************/
@@ -958,12 +953,12 @@ void frwd_init(frwd_struct_t *msp, const OPJ_UINT8* data, int size,
   *  @param [in]  msp is a pointer to frwd_struct_t
   *  @param [in]  num_bits is the number of bit to consume
   */
-static inline 
+static INLINE
 void frwd_advance(frwd_struct_t *msp, OPJ_UINT32 num_bits)
 {
-  assert(num_bits <= msp->bits);
-  msp->tmp >>= num_bits;  // consume num_bits
-  msp->bits -= num_bits;
+    assert(num_bits <= msp->bits);
+    msp->tmp >>= num_bits;  // consume num_bits
+    msp->bits -= num_bits;
 }
 
 //************************************************************************/
@@ -971,16 +966,16 @@ void frwd_advance(frwd_struct_t *msp, OPJ_UINT32 num_bits)
   *
   *  @param [in]  msp is a pointer to frwd_struct_t
   */
-static inline 
+static INLINE
 OPJ_UINT32 frwd_fetch(frwd_struct_t *msp)
 {
-  if (msp->bits < 32)
-  {
-    frwd_read(msp);
-    if (msp->bits < 32) //need to test
-      frwd_read(msp);
-  }
-  return (OPJ_UINT32)msp->tmp;
+    if (msp->bits < 32) {
+        frwd_read(msp);
+        if (msp->bits < 32) { //need to test
+            frwd_read(msp);
+        }
+    }
+    return (OPJ_UINT32)msp->tmp;
 }
 
 //************************************************************************/
@@ -1005,8 +1000,8 @@ static OPJ_BOOL opj_t1_allocate_buffers(
 
         if (datasize > t1->datasize) {
             opj_aligned_free(t1->data);
-            t1->data = (OPJ_INT32*) 
-              opj_aligned_malloc(datasize * sizeof(OPJ_INT32));
+            t1->data = (OPJ_INT32*)
+                       opj_aligned_malloc(datasize * sizeof(OPJ_INT32));
             if (!t1->data) {
                 /* FIXME event manager error callback */
                 return OPJ_FALSE;
@@ -1059,1476 +1054,1445 @@ OPJ_BOOL opj_t1_ht_decode_cblk(opj_t1_t *t1,
                                opj_mutex_t* p_manager_mutex,
                                OPJ_BOOL check_pterm)
 {
-  OPJ_BYTE* cblkdata = NULL;
-  OPJ_UINT8* coded_data;
-  OPJ_UINT32* decoded_data;
-  OPJ_UINT32 num_passes;
-  OPJ_UINT32 lengths1;
-  OPJ_UINT32 lengths2;
-  OPJ_INT32 width;
-  OPJ_INT32 height;
-  OPJ_INT32 stride;
-  OPJ_UINT32 *pflags, *sigma1, *sigma2, *mbr1, *mbr2, *sip, sip_shift;
-  OPJ_UINT32 p;
-  OPJ_UINT32 zero_planes_p1;
-  int lcup, scup;
-  dec_mel_t mel;
-  rev_struct_t vlc;
-  frwd_struct_t magsgn;
-  frwd_struct_t sigprop;
-  rev_struct_t magref;
-  OPJ_UINT8 *lsp, *line_state;
-  int run;  
-  OPJ_UINT32 vlc_val;           
-  OPJ_UINT32 qinf[2];
-  OPJ_UINT32 c_q;
-  OPJ_UINT32* sp;
-
-  (void)(orient);      // stops unused parameter message
-  (void)(check_pterm); // stops unused parameter message
-
-  // We ignor orient, because the same decoder is used for all subbands
-  // We also ignore check_pterm, because I am not sure how it applies
-  assert(cblksty == 0x40); // that is the only support mode
-  if (roishift != 0) {
-    if (p_manager_mutex)
-      opj_mutex_lock(p_manager_mutex);
-    opj_event_msg(p_manager, EVT_ERROR, "We do not support ROI in decoding "
-                            "HT codeblocks\n");
-    if (p_manager_mutex)
-      opj_mutex_unlock(p_manager_mutex);
-    return OPJ_FALSE;
-  }
-
-  if (!opj_t1_allocate_buffers(
-              t1,
-              (OPJ_UINT32)(cblk->x1 - cblk->x0),
-              (OPJ_UINT32)(cblk->y1 - cblk->y0))) {
-      return OPJ_FALSE;
-  }
-
-  /* Even if we have a single chunk, in multi-threaded decoding */
-  /* the insertion of our synthetic marker might potentially override */
-  /* valid codestream of other codeblocks decoded in parallel. */
-  if (cblk->numchunks > 1 || t1->mustuse_cblkdatabuffer) {
-      OPJ_UINT32 i;
-      OPJ_UINT32 cblk_len;
-
-      /* Compute whole codeblock length from chunk lengths */
-      cblk_len = 0;
-      for (i = 0; i < cblk->numchunks; i++) {
-          cblk_len += cblk->chunks[i].len;
-      }
-
-      /* Allocate temporary memory if needed */
-      if (cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA > t1->cblkdatabuffersize) {
-          cblkdata = (OPJ_BYTE*)opj_realloc(
-              t1->cblkdatabuffer, cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA);
-          if (cblkdata == NULL) {
-              return OPJ_FALSE;
-          }
-          t1->cblkdatabuffer = cblkdata;
-          memset(t1->cblkdatabuffer + cblk_len, 0, OPJ_COMMON_CBLK_DATA_EXTRA);
-          t1->cblkdatabuffersize = cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA;
-      }
-
-      /* Concatenate all chunks */
-      cblkdata = t1->cblkdatabuffer;
-      cblk_len = 0;
-      for (i = 0; i < cblk->numchunks; i++) {
-          memcpy(cblkdata+cblk_len, cblk->chunks[i].data, cblk->chunks[i].len);
-          cblk_len += cblk->chunks[i].len;
-      }
-  } else if (cblk->numchunks == 1) {
-      cblkdata = cblk->chunks[0].data;
-  } else {
-      /* Not sure if that can happen in practice, but avoid Coverity to */
-      /* think we will dereference a null cblkdta pointer */
-      return OPJ_TRUE;
-  }
-
-  // OPJ_BYTE* coded_data is a pointer to bitstream
-  coded_data = cblkdata;
-  // OPJ_UINT32* decoded_data is a pointer to decoded codeblock data buf.
-  decoded_data = (OPJ_UINT32*)t1->data;
-  // OPJ_UINT32 num_passes is the number of passes: 1 if CUP only, 2 for 
-  // CUP+SPP, and 3 for CUP+SPP+MRP
-  num_passes = cblk->numsegs>0 ? cblk->segs[0].real_num_passes : 0;
-  num_passes += cblk->numsegs>1 ? cblk->segs[1].real_num_passes : 0;
-  // OPJ_UINT32 lengths1 is the length of cleanup pass
-  lengths1 = num_passes > 0 ? cblk->segs[0].len : 0;
-  // OPJ_UINT32 lengths2 is the length of refinement passes (either SPP only or SPP+MRP)
-  lengths2 = num_passes > 1 ? cblk->segs[1].len : 0;
-  // OPJ_INT32 width is the decoded codeblock width 
-  width = cblk->x1 - cblk->x0;
-  // OPJ_INT32 height is the decoded codeblock height
-  height = cblk->y1 - cblk->y0;
-  // OPJ_INT32 stride is the decoded codeblock buffer stride 
-  stride = width;
-
-   /*  sigma1 and sigma2 contains significant (i.e., non-zero) pixel 
-    *  locations.  The buffers are used interchangeably, because we need
-    *  more than 4 rows of significance information at a given time.
-    *  Each 32 bits contain significance information for 4 rows of 8 
-    *  columns each.  If we denote 32 bits by 0xaaaaaaaa, the each "a" is
-    *  called a nibble and has significance information for 4 rows.
-    *  The least significant nibble has information for the first column,
-    *  and so on. The nibble's LSB is for the first row, and so on.
-    *  Since, at most, we can have 1024 columns in a quad, we need 128
-    *  entries; we added 1 for convenience when propagation of signifcance
-    *  goes outside the structure
-    *  To work in OpenJPEG these buffers has been expanded to 132.
-    */
-  // OPJ_UINT32 *pflags, *sigma1, *sigma2, *mbr1, *mbr2, *sip, sip_shift;
-  pflags = (OPJ_UINT32 *)t1->flags;
-  sigma1 = pflags;
-  sigma2 = sigma1 + 132;
-  // mbr arrangement is similar to sigma; mbr contains locations 
-  // that become significant during significance propagation pass
-  mbr1 = sigma2 + 132;
-  mbr2 = mbr1 + 132;
-  //a pointer to sigma
-  sip = sigma1;  //pointers to arrays to be used interchangeably
-  sip_shift = 0; //the amount of shift needed for sigma
-
-  if (num_passes > 1 && lengths2 == 0)
-  {
-    if (p_manager_mutex)
-      opj_mutex_lock(p_manager_mutex);
-    opj_event_msg(p_manager, EVT_WARNING, "A malformed codeblock that has "
-                  "more than one coding pass, but zero length for "
-                  "2nd and potential 3rd pass.\n");
-    if (p_manager_mutex)
-      opj_mutex_unlock(p_manager_mutex);
-    num_passes = 1;
-  }
-  if (num_passes > 3)
-  {
-    if (p_manager_mutex)
-      opj_mutex_lock(p_manager_mutex);
-    opj_event_msg(p_manager, EVT_WARNING, "We do not support more than 3 "
-                            "coding passes; This codeblocks has %d passes.\n",
-                            num_passes);
-    if (p_manager_mutex)
-      opj_mutex_unlock(p_manager_mutex);
-    return OPJ_FALSE;
-  }
-
-  if (cblk->numbps == 1 && num_passes > 1)
-    {
-      // We do not have enough precision to decode SgnProp nor MagRef passes.
-      // We decode the cleanup passes only
-      if (cannot_decode_spp_mrp_msg == OPJ_FALSE) {
-        if (p_manager_mutex)
-          opj_mutex_lock(p_manager_mutex);
-        cannot_decode_spp_mrp_msg = OPJ_TRUE;
-        opj_event_msg(p_manager, EVT_WARNING, "Not enough precision to decode "
-                                "the SgnProp nor MagRef passes.  This message "
-                                "will not be displayed again.\n");
-        if (p_manager_mutex)
-          opj_mutex_unlock(p_manager_mutex);
-      }
-      num_passes = 1;
-    }
-  if (cblk->numbps == 0)
-    {
-      // We do not have enough precision to decode the CUP pass with the 
-      // center of bin bit set.  The code can be modified to support this 
-      // case, without using the center of the bin.
-      if (cannot_decode_due_to_insufficient_precision == OPJ_FALSE) {
-        if (p_manager_mutex)
-          opj_mutex_lock(p_manager_mutex);
-        cannot_decode_due_to_insufficient_precision = OPJ_TRUE;
-        opj_event_msg(p_manager, EVT_WARNING, "Not enough precision to decode "
-                                "the cleanup pass. The code should be "
-                                "modified to support this case. This message "
-                                "will not be displayed again.\n");
-        if (p_manager_mutex)
-          opj_mutex_unlock(p_manager_mutex);
-      }
-      return OPJ_TRUE;
-    }
-
-  // OPJ_INT32
-  p = cblk->numbps; 
-  // OPJ_INT32 zero planes plus 1
-  zero_planes_p1 = cblk->Mb - cblk->numbps + 1;
-
-  // read scup and fix the bytes there
-  lcup = (int)lengths1;  // length of CUP
-  //scup is the length of MEL + VLC
-  scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
-  if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
-    return OPJ_FALSE;
-
-  // init structures
-  mel_init(&mel, coded_data, lcup, scup);
-  rev_init(&vlc, coded_data, lcup, scup);
-  frwd_init(&magsgn, coded_data, lcup - scup, 0xFF);
-  if (num_passes > 1) // needs to be tested
-    frwd_init(&sigprop, coded_data + lengths1, (int)lengths2, 0);
-  if (num_passes > 2)
-    rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
-
-  /** State storage
-    *  One byte per quad; for 1024 columns, or 512 quads, we need
-    *  512 bytes. We are using 2 extra bytes one on the left and one on
-    *  the right for convenience.
-    *
-    *  The MSB bit in each byte is (\sigma^nw | \sigma^n), and the 7 LSBs
-    *  contain max(E^nw | E^n)
-    */
-
-  // 514 is enough for a block width of 1024, +2 extra
-  // here expanded to 528
-  line_state = (OPJ_UINT8 *)(mbr2 + 132); 
-
-  //initial 2 lines
-  /////////////////
-  lsp = line_state;           // point to line state
-  lsp[0] = 0;                 // for initial row of quad, we set to 0
-  run = mel_get_run(&mel);    // decode runs of events from MEL bitstrm
-                              // data represented as runs of 0 events
-                              // See mel_decode description
-  qinf[0] = qinf[1] = 0;      // quad info decoded from VLC bitstream
-  c_q = 0;                    // context for quad q
-  sp = decoded_data;          // decoded codeblock samples
-  // vlc_val;                 // fetched data from VLC bitstream
-
-  for (OPJ_INT32 x = 0; x < width; x += 4) // one iteration per quad pair
-  {
-    OPJ_UINT32 U_q[2]; // u values for the quad pair
-    OPJ_UINT32 uvlc_mode;
-    OPJ_UINT32 consumed_bits;
-    OPJ_UINT32 m_n, v_n;
-    OPJ_UINT32 ms_val;
-    OPJ_UINT32 locs;
-
-    // decode VLC
-    /////////////
-
-    //first quad
-    // Get the head of the VLC bitstream. One fetch is enough for two 
-    // quads, since the largest VLC code is 7 bits, and maximum number of 
-    // bits used for u is 8.  Therefore for two quads we need 30 bits 
-    // (if we include unstuffing, then 32 bits are enough, since we have 
-    // a maximum of one stuffing per two bytes)
-    vlc_val = rev_fetch(&vlc);
-
-    //decode VLC using the context c_q and the head of the VLC bitstream
-    qinf[0] = vlc_tbl0[ (c_q << 7) | (vlc_val & 0x7F) ];
-
-    if (c_q == 0) // if zero context, we need to use one MEL event
-    {
-      run -= 2; //the number of 0 events is multiplied by 2, so subtract 2
-
-      // Is the run terminated in 1? if so, use decoded VLC code, 
-      // otherwise, discard decoded data, since we will decoded again 
-      // using a different context
-      qinf[0] = (run == -1) ? qinf[0] : 0;
-
-      // is run -1 or -2? this means a run has been consumed
-      if (run < 0) 
-        run = mel_get_run(&mel);  // get another run
-    }
-
-    // prepare context for the next quad; eqn. 1 in ITU T.814
-    c_q = ((qinf[0] & 0x10) >> 4) | ((qinf[0] & 0xE0) >> 5);
-
-    //remove data from vlc stream (0 bits are removed if qinf is not used)
-    vlc_val = rev_advance(&vlc, qinf[0] & 0x7);
-
-    //update sigma
-    // The update depends on the value of x; consider one OPJ_UINT32
-    // if x is 0, 8, 16 and so on, then this line update c locations
-    //      nibble (4 bits) number   0 1 2 3 4 5 6 7
-    //                         LSB   c c 0 0 0 0 0 0 
-    //                               c c 0 0 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    // if x is 4, 12, 20, then this line update locations c
-    //      nibble (4 bits) number   0 1 2 3 4 5 6 7
-    //                         LSB   0 0 0 0 c c 0 0 
-    //                               0 0 0 0 c c 0 0
-    //                               0 0 0 0 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    *sip |= (((qinf[0] & 0x30)>>4) | ((qinf[0] & 0xC0)>>2)) << sip_shift;
-
-    //second quad
-    qinf[1] = 0;
-    if (x + 2 < width) // do not run if codeblock is narrower
-    {
-      //decode VLC using the context c_q and the head of the VLC bitstream
-      qinf[1] = vlc_tbl0[(c_q << 7) | (vlc_val & 0x7F)]; 
-
-      // if context is zero, use one MEL event
-      if (c_q == 0) //zero context
-      {
-        run -= 2; //subtract 2, since events number if multiplied by 2
-
-        // if event is 0, discard decoded qinf
-        qinf[1] = (run == -1) ? qinf[1] : 0;
-
-        if (run < 0) // have we consumed all events in a run
-          run = mel_get_run(&mel); // if yes, then get another run
-      }
-
-      //prepare context for the next quad, eqn. 1 in ITU T.814
-      c_q = ((qinf[1] & 0x10) >> 4) | ((qinf[1] & 0xE0) >> 5);
-
-      //remove data from vlc stream, if qinf is not used, cwdlen is 0
-      vlc_val = rev_advance(&vlc, qinf[1] & 0x7);
-    }
-
-    //update sigma
-    // The update depends on the value of x; consider one OPJ_UINT32
-    // if x is 0, 8, 16 and so on, then this line update c locations
-    //      nibble (4 bits) number   0 1 2 3 4 5 6 7
-    //                         LSB   0 0 c c 0 0 0 0 
-    //                               0 0 c c 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    // if x is 4, 12, 20, then this line update locations c
-    //      nibble (4 bits) number   0 1 2 3 4 5 6 7
-    //                         LSB   0 0 0 0 0 0 c c 
-    //                               0 0 0 0 0 0 c c
-    //                               0 0 0 0 0 0 0 0
-    //                               0 0 0 0 0 0 0 0
-    *sip |= (((qinf[1] & 0x30) | ((qinf[1] & 0xC0)<<2))) << (4+sip_shift);
-
-    sip += x & 0x7 ? 1 : 0; // move sigma pointer to next entry
-    sip_shift ^= 0x10;      // increment/decrement sip_shift by 16
-
-    // retrieve u
-    /////////////
-
-    // uvlc_mode is made up of u_offset bits from the quad pair
-    uvlc_mode = ((qinf[0] & 0x8) >> 3) | ((qinf[1] & 0x8) >> 2);
-    if (uvlc_mode == 3)  // if both u_offset are set, get an event from
-    {                    // the MEL run of events
-      run -= 2; //subtract 2, since events number if multiplied by 2
-      uvlc_mode += (run == -1) ? 1 : 0; //increment uvlc_mode if event is 1
-      if (run < 0) // if run is consumed (run is -1 or -2), get another run
-        run = mel_get_run(&mel);
-    }
-    //decode uvlc_mode to get u for both quads
-    consumed_bits = decode_init_uvlc(vlc_val, uvlc_mode, U_q);
-    if (U_q[0] > zero_planes_p1 || U_q[1] > zero_planes_p1)
-    {
-      if (p_manager_mutex)
-        opj_mutex_lock(p_manager_mutex);
-      opj_event_msg(p_manager, EVT_ERROR, "Malformed HT codeblock. Decoding "
-                              "this codeblock is stopped.\n");
-      if (p_manager_mutex)
-        opj_mutex_unlock(p_manager_mutex);
-      return OPJ_FALSE;
-    }
-
-    //consume u bits in the VLC code
-    vlc_val = rev_advance(&vlc, consumed_bits);
-
-    //decode magsgn and update line_state
-    /////////////////////////////////////
-
-    //We obtain a mask for the samples locations that needs evaluation
-    locs = 0xFF;
-    if (x + 4 > width) locs >>= (x + 4 - width) << 1; // limits width
-    locs = height > 1 ? locs : (locs & 0x55);         // limits height
-
-    //first quad, starting at first sample in quad and moving on
-    if (qinf[0] & 0x10) //is it signifcant? (sigma_n)
-    {
-      OPJ_UINT32 val;
-
-      ms_val = frwd_fetch(&magsgn);         //get 32 bits of magsgn data
-      m_n = U_q[0] - ((qinf[0] >> 12) & 1); //evaluate m_n (number of bits
-                                  // to read from bitstream), using EMB e_k
-      frwd_advance(&magsgn, m_n);         //consume m_n
-      val = ms_val << 31;                 //get sign bit
-      v_n = ms_val & ((1U << m_n) - 1);   //keep only m_n bits
-      v_n |= ((qinf[0] & 0x100) >> 8) << m_n;  //add EMB e_1 as MSB
-      v_n |= 1;                                //add center of bin    
-      //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
-      //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
-      sp[0] = val | ((v_n + 2) << (p - 1)); 
-    }
-    else if (locs & 0x1) // if this is outside the codeblock, set the 
-      sp[0] = 0;         // sample to zero
-
-    if (qinf[0] & 0x20) //sigma_n
-    {
-      OPJ_UINT32 val, t;
-
-      ms_val = frwd_fetch(&magsgn);         //get 32 bits
-      m_n = U_q[0] - ((qinf[0] >> 13) & 1); //m_n, uses EMB e_k
-      frwd_advance(&magsgn, m_n);           //consume m_n
-      val = ms_val << 31;                   //get sign bit
-      v_n = ms_val & ((1U << m_n) - 1);     //keep only m_n bits
-      v_n |= ((qinf[0] & 0x200) >> 9) << m_n; //add EMB e_1
-      v_n |= 1;                               //bin center
-      //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
-      //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
-      sp[stride] = val | ((v_n + 2) << (p - 1)); 
-
-      //update line_state: bit 7 (\sigma^N), and E^N
-      t = lsp[0] & 0x7F;       // keep E^NW
-      v_n = 32 - count_leading_zeros(v_n); 
-      lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n)); //max(E^NW, E^N) | s
-    }
-    else if (locs & 0x2) // if this is outside the codeblock, set the 
-      sp[stride] = 0;    //no need to update line_state
-
-    ++lsp; // move to next quad information
-    ++sp;  // move to next column of samples
-
-    //this is similar to the above two samples
-    if (qinf[0] & 0x40) 
-    {
-      OPJ_UINT32 val;
-
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[0] - ((qinf[0] >> 14) & 1); 
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= (((qinf[0] & 0x400) >> 10) << m_n);
-      v_n |= 1; 
-      sp[0] = val | ((v_n + 2) << (p - 1));
-    }
-    else if (locs & 0x4)
-      sp[0] = 0;
-
-    lsp[0] = 0;
-    if (qinf[0] & 0x80) 
-    {
-      OPJ_UINT32 val;
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[0] - ((qinf[0] >> 15) & 1); //m_n
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= ((qinf[0] & 0x800) >> 11) << m_n;
-      v_n |= 1; //center of bin
-      sp[stride] = val | ((v_n + 2) << (p - 1));
-
-      //line_state: bit 7 (\sigma^NW), and E^NW for next quad
-      lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
-    }
-    else if (locs & 0x8) //if outside set to 0
-      sp[stride] = 0;
-
-    ++sp; //move to next column
-
-    //second quad
-    if (qinf[1] & 0x10) 
-    {
-      OPJ_UINT32 val;
-
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[1] - ((qinf[1] >> 12) & 1); //m_n
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= (((qinf[1] & 0x100) >> 8) << m_n);
-      v_n |= 1;
-      sp[0] = val | ((v_n + 2) << (p - 1));
-    }
-    else if (locs & 0x10)
-      sp[0] = 0;
-
-    if (qinf[1] & 0x20)
-    {
-      OPJ_UINT32 val, t;
-
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[1] - ((qinf[1] >> 13) & 1); //m_n
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= (((qinf[1] & 0x200) >> 9) << m_n);
-      v_n |= 1;
-      sp[stride] = val | ((v_n + 2) << (p - 1));
-
-      //update line_state: bit 7 (\sigma^N), and E^N
-      t = lsp[0] & 0x7F;            //E^NW
-      v_n = 32 - count_leading_zeros(v_n);     //E^N
-      lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n)); //max(E^NW, E^N) | s
-    }
-    else if (locs & 0x20)
-      sp[stride] = 0;      //no need to update line_state
-
-    ++lsp; //move line state to next quad
-    ++sp;  //move to next sample
-
-    if (qinf[1] & 0x40)
-    {
-      OPJ_UINT32 val;
-
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[1] - ((qinf[1] >> 14) & 1); //m_n
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= (((qinf[1] & 0x400) >> 10) << m_n);
-      v_n |= 1;
-      sp[0] = val | ((v_n + 2) << (p - 1));
-    }
-    else if (locs & 0x40)
-      sp[0] = 0;
-
-    lsp[0] = 0;
-    if (qinf[1] & 0x80)
-    {
-      OPJ_UINT32 val;
-
-      ms_val = frwd_fetch(&magsgn);
-      m_n = U_q[1] - ((qinf[1] >> 15) & 1); //m_n
-      frwd_advance(&magsgn, m_n);
-      val = ms_val << 31;
-      v_n = ms_val & ((1U << m_n) - 1);
-      v_n |= (((qinf[1] & 0x800) >> 11) << m_n);
-      v_n |= 1; //center of bin
-      sp[stride] = val | ((v_n + 2) << (p - 1));
-
-      //line_state: bit 7 (\sigma^NW), and E^NW for next quad
-      lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
-    }
-    else if (locs & 0x80)
-      sp[stride] = 0;
-
-    ++sp;
-  }
-
-  //non-initial lines
-  //////////////////////////
-  for (OPJ_INT32 y = 2; y < height; /*done at the end of loop*/)
-  {
-    OPJ_UINT32 *sip;
-    OPJ_UINT8 ls0;
-
-    sip_shift ^= 0x2;  // shift sigma to the upper half od the nibble
-    sip_shift &= 0xFFFFFFEFU; //move back to 0 (it might have been at 0x10)
-    sip = y & 0x4 ? sigma2 : sigma1; //choose sigma array
-
-    lsp = line_state;
-    ls0 = lsp[0];                   // read the line state value
-    lsp[0] = 0;                     // and set it to zero
-    sp = decoded_data + y * stride; // generated samples
-    c_q = 0;                        // context
-    for (OPJ_INT32 x = 0; x < width; x += 4)
-    {
-      OPJ_UINT32 U_q[2];
-      OPJ_UINT32 uvlc_mode, consumed_bits;
-      OPJ_UINT32 m_n, v_n;
-      OPJ_UINT32 ms_val;
-      OPJ_UINT32 locs;
-
-      // decode vlc
-      /////////////
-
-      //first quad
-      // get context, eqn. 2 ITU T.814
-      // c_q has \sigma^W | \sigma^SW
-      c_q |= (ls0 >> 7);          //\sigma^NW | \sigma^N
-      c_q |= (lsp[1] >> 5) & 0x4; //\sigma^NE | \sigma^NF
-
-      //the following is very similar to previous code, so please refer to 
-      // that
-      vlc_val = rev_fetch(&vlc);
-      qinf[0] = vlc_tbl1[(c_q << 7) | (vlc_val & 0x7F)];
-      if (c_q == 0) //zero context
-      {
-        run -= 2;
-        qinf[0] = (run == -1) ? qinf[0] : 0;
-        if (run < 0)
-          run = mel_get_run(&mel);
-      }
-      //prepare context for the next quad, \sigma^W | \sigma^SW
-      c_q = ((qinf[0] & 0x40) >> 5) | ((qinf[0] & 0x80) >> 6);
-
-      //remove data from vlc stream
-      vlc_val = rev_advance(&vlc, qinf[0] & 0x7);
-
-      //update sigma
-      // The update depends on the value of x and y; consider one OPJ_UINT32
-      // if x is 0, 8, 16 and so on, and y is 2, 6, etc., then this 
-      // line update c locations
-      //      nibble (4 bits) number   0 1 2 3 4 5 6 7
-      //                         LSB   0 0 0 0 0 0 0 0 
-      //                               0 0 0 0 0 0 0 0
-      //                               c c 0 0 0 0 0 0
-      //                               c c 0 0 0 0 0 0
-      *sip |= (((qinf[0]&0x30) >> 4) | ((qinf[0]&0xC0) >> 2)) << sip_shift;
-
-      //second quad
-      qinf[1] = 0;
-      if (x + 2 < width)
-      {
-        c_q |= (lsp[1] >> 7);
-        c_q |= (lsp[2] >> 5) & 0x4;
-        qinf[1] = vlc_tbl1[(c_q << 7) | (vlc_val & 0x7F)];
-        if (c_q == 0) //zero context
-        {
-          run -= 2;
-          qinf[1] = (run == -1) ? qinf[1] : 0;
-          if (run < 0)
-            run = mel_get_run(&mel);
+    OPJ_BYTE* cblkdata = NULL;
+    OPJ_UINT8* coded_data;
+    OPJ_UINT32* decoded_data;
+    OPJ_UINT32 num_passes;
+    OPJ_UINT32 lengths1;
+    OPJ_UINT32 lengths2;
+    OPJ_INT32 width;
+    OPJ_INT32 height;
+    OPJ_INT32 stride;
+    OPJ_UINT32 *pflags, *sigma1, *sigma2, *mbr1, *mbr2, *sip, sip_shift;
+    OPJ_UINT32 p;
+    OPJ_UINT32 zero_planes_p1;
+    int lcup, scup;
+    dec_mel_t mel;
+    rev_struct_t vlc;
+    frwd_struct_t magsgn;
+    frwd_struct_t sigprop;
+    rev_struct_t magref;
+    OPJ_UINT8 *lsp, *line_state;
+    int run;
+    OPJ_UINT32 vlc_val;              // fetched data from VLC bitstream
+    OPJ_UINT32 qinf[2];
+    OPJ_UINT32 c_q;
+    OPJ_UINT32* sp;
+    OPJ_INT32 x, y; // loop indices
+
+    (void)(orient);      // stops unused parameter message
+    (void)(check_pterm); // stops unused parameter message
+
+    // We ignore orient, because the same decoder is used for all subbands
+    // We also ignore check_pterm, because I am not sure how it applies
+    assert(cblksty == 0x40); // that is the only supported mode
+    if (roishift != 0) {
+        if (p_manager_mutex) {
+            opj_mutex_lock(p_manager_mutex);
+        }
+        opj_event_msg(p_manager, EVT_ERROR, "We do not support ROI in decoding "
+                      "HT codeblocks\n");
+        if (p_manager_mutex) {
+            opj_mutex_unlock(p_manager_mutex);
         }
-        //prepare context for the next quad
-        c_q = ((qinf[1] & 0x40) >> 5) | ((qinf[1] & 0x80) >> 6);
-        //remove data from vlc stream
-        vlc_val = rev_advance(&vlc, qinf[1] & 0x7);
-      }
-
-      //update sigma
-      *sip |= (((qinf[1]&0x30) | ((qinf[1]&0xC0) << 2))) << (4+sip_shift);
-
-      sip += x & 0x7 ? 1 : 0;
-      sip_shift ^= 0x10;
-
-      //retrieve u
-      ////////////
-      uvlc_mode = ((qinf[0] & 0x8) >> 3) | ((qinf[1] & 0x8) >> 2);
-      consumed_bits = decode_noninit_uvlc(vlc_val, uvlc_mode, U_q);
-      vlc_val = rev_advance(&vlc, consumed_bits);
-
-      //calculate E^max and add it to U_q, eqns 5 and 6 in ITU T.814
-      if ((qinf[0] & 0xF0) & ((qinf[0] & 0xF0) - 1)) // is \gamma_q 1?
-      {
-        OPJ_UINT32 E = (ls0 & 0x7Fu);
-        E = E > (lsp[1] & 0x7Fu) ? E : (lsp[1]&0x7Fu); //max(E, E^NE, E^NF)
-        //since U_q alread has u_q + 1, we subtract 2 instead of 1
-        U_q[0] += E > 2 ? E - 2 : 0;
-      }
-
-      if ((qinf[1] & 0xF0) & ((qinf[1] & 0xF0) - 1)) //is \gamma_q 1? 
-      {
-        OPJ_UINT32 E = (lsp[1] & 0x7Fu);
-        E = E > (lsp[2] & 0x7Fu) ? E : (lsp[2]&0x7Fu); //max(E, E^NE, E^NF)
-        //since U_q alread has u_q + 1, we subtract 2 instead of 1
-        U_q[1] += E > 2 ? E - 2 : 0;
-      }
-
-      if (U_q[0] > zero_planes_p1 || U_q[1] > zero_planes_p1)
-      {
-        if (p_manager_mutex)
-          opj_mutex_lock(p_manager_mutex);
-        opj_event_msg(p_manager, EVT_ERROR, "Malformed HT codeblock. "
-                                "Decoding this codeblock is stopped.\n");
-        if (p_manager_mutex)
-          opj_mutex_unlock(p_manager_mutex);
         return OPJ_FALSE;
-      }
-
-      ls0 = lsp[2]; //for next double quad
-      lsp[1] = lsp[2] = 0;
-
-      //decode magsgn and update line_state
-      /////////////////////////////////////
-
-      //locations where samples need update
-      locs = 0xFF;
-      if (x + 4 > width) locs >>= (x + 4 - width) << 1;
-      locs = height > 1 ? locs : (locs & 0x55);
-
-
-      if (qinf[0] & 0x10) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[0] - ((qinf[0] >> 12) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= ((qinf[0] & 0x100) >> 8) << m_n;
-        v_n |= 1; //center of bin
-        sp[0] = val | ((v_n + 2) << (p - 1));
-      }
-      else if (locs & 0x1)
-        sp[0] = 0;
-
-      if (qinf[0] & 0x20) //sigma_n
-      {
-        OPJ_UINT32 val, t;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[0] - ((qinf[0] >> 13) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= ((qinf[0] & 0x200) >> 9) << m_n;
-        v_n |= 1; //center of bin
-        sp[stride] = val | ((v_n + 2) << (p - 1));
-
-        //update line_state: bit 7 (\sigma^N), and E^N
-        t = lsp[0] & 0x7F;          //E^NW
-        v_n = 32 - count_leading_zeros(v_n); 
-        lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n));
-      }
-      else if (locs & 0x2)
-        sp[stride] = 0; //no need to update line_state
-
-      ++lsp;
-      ++sp;
-
-      if (qinf[0] & 0x40) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[0] - ((qinf[0] >> 14) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= (((qinf[0] & 0x400) >> 10) << m_n);
-        v_n |= 1;                            //center of bin
-        sp[0] = val | ((v_n + 2) << (p - 1));
-      }
-      else if (locs & 0x4)
-        sp[0] = 0;
-
-      if (qinf[0] & 0x80) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[0] - ((qinf[0] >> 15) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= ((qinf[0] & 0x800) >> 11) << m_n;
-        v_n |= 1; //center of bin
-        sp[stride] = val | ((v_n + 2) << (p - 1));
-
-        //update line_state: bit 7 (\sigma^NW), and E^NW for next quad
-        lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
-      }
-      else if (locs & 0x8)
-        sp[stride] = 0;
-
-      ++sp;
-
-      if (qinf[1] & 0x10) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[1] - ((qinf[1] >> 12) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= (((qinf[1] & 0x100) >> 8) << m_n);
-        v_n |= 1;                            //center of bin
-        sp[0] = val | ((v_n + 2) << (p - 1));
-      }
-      else if (locs & 0x10)
-        sp[0] = 0;
-
-      if (qinf[1] & 0x20) //sigma_n
-      {
-        OPJ_UINT32 val, t;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[1] - ((qinf[1] >> 13) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= (((qinf[1] & 0x200) >> 9) << m_n);
-        v_n |= 1; //center of bin
-        sp[stride] = val | ((v_n + 2) << (p - 1));
-
-        //update line_state: bit 7 (\sigma^N), and E^N
-        t = lsp[0] & 0x7F;          //E^NW
-        v_n = 32 - count_leading_zeros(v_n); 
-        lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n));
-      }
-      else if (locs & 0x20)
-        sp[stride] = 0; //no need to update line_state
-
-      ++lsp;
-      ++sp;
-
-      if (qinf[1] & 0x40) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[1] - ((qinf[1] >> 14) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= (((qinf[1] & 0x400) >> 10) << m_n);
-        v_n |= 1;                            //center of bin
-        sp[0] = val | ((v_n + 2) << (p - 1));
-      }
-      else if (locs & 0x40)
-        sp[0] = 0;
-
-      if (qinf[1] & 0x80) //sigma_n
-      {
-        OPJ_UINT32 val;
-
-        ms_val = frwd_fetch(&magsgn);
-        m_n = U_q[1] - ((qinf[1] >> 15) & 1); //m_n
-        frwd_advance(&magsgn, m_n);
-        val = ms_val << 31;
-        v_n = ms_val & ((1U << m_n) - 1);
-        v_n |= (((qinf[1] & 0x800) >> 11) << m_n);
-        v_n |= 1; //center of bin
-        sp[stride] = val | ((v_n + 2) << (p - 1));
-
-        //update line_state: bit 7 (\sigma^NW), and E^NW for next quad
-        lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
-      }
-      else if (locs & 0x80)
-        sp[stride] = 0;
-
-      ++sp;
     }
 
-    y += 2;
-    if (num_passes > 1 && (y & 3) == 0) //executed at multiples of 4
-    { // This is for SPP and potentially MRP
+    if (!opj_t1_allocate_buffers(
+                t1,
+                (OPJ_UINT32)(cblk->x1 - cblk->x0),
+                (OPJ_UINT32)(cblk->y1 - cblk->y0))) {
+        return OPJ_FALSE;
+    }
 
-      if (num_passes > 2) //do MRP
-      {
-        // select the current stripe
-        OPJ_UINT32 *cur_sig = y & 0x4 ? sigma1 : sigma2;
-        // the address of the data that needs updating
-        OPJ_UINT32 *dpp = decoded_data + (y - 4) * stride;
-        OPJ_UINT32 half = 1u << (p - 2); // half the center of the bin
-        for (OPJ_INT32 i = 0; i < width; i += 8)
-        {
-          //Process one entry from sigma array at a time
-          // Each nibble (4 bits) in the sigma array represents 4 rows,
-          // and the 32 bits contain 8 columns
-          OPJ_UINT32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
-          OPJ_UINT32 sig = *cur_sig++; // 32 bit that will be processed now
-          OPJ_UINT32 col_mask = 0xFu;  // a mask for a column in sig
-          OPJ_UINT32 *dp = dpp + i;    // next column in decode samples
-          if (sig) // if any of the 32 bits are set
-          {
-            for (int j = 0; j < 8; ++j, dp++) //one column at a time
-            {
-              if (sig & col_mask) // lowest nibble
-              {
-                OPJ_UINT32 sample_mask = 0x11111111u & col_mask; //LSB
+    /* Even if we have a single chunk, in multi-threaded decoding */
+    /* the insertion of our synthetic marker might potentially override */
+    /* valid codestream of other codeblocks decoded in parallel. */
+    if (cblk->numchunks > 1 || t1->mustuse_cblkdatabuffer) {
+        OPJ_UINT32 i;
+        OPJ_UINT32 cblk_len;
 
-                if (sig & sample_mask) //if LSB is set
-                {
-                  OPJ_UINT32 sym;
+        /* Compute whole codeblock length from chunk lengths */
+        cblk_len = 0;
+        for (i = 0; i < cblk->numchunks; i++) {
+            cblk_len += cblk->chunks[i].len;
+        }
 
-                  assert(dp[0] != 0); // decoded value cannot be zero
-                  sym = cwd & 1; // get it value
-                  // remove center of bin if sym is 0
-                  dp[0] ^= (1 - sym) << (p - 1);
-                  dp[0] |= half;      // put half the center of bin
-                  cwd >>= 1;          //consume word
-                }
-                sample_mask += sample_mask; //next row
-
-                if (sig & sample_mask)
-                {
-                  OPJ_UINT32 sym;
-
-                  assert(dp[stride] != 0);
-                  sym = cwd & 1;
-                  dp[stride] ^= (1 - sym) << (p - 1);
-                  dp[stride] |= half;
-                  cwd >>= 1;
-                }
-                sample_mask += sample_mask;
-
-                if (sig & sample_mask)
-                {
-                  OPJ_UINT32 sym;
-
-                  assert(dp[2 * stride] != 0);
-                  sym = cwd & 1;
-                  dp[2 * stride] ^= (1 - sym) << (p - 1);
-                  dp[2 * stride] |= half;
-                  cwd >>= 1;
-                }
-                sample_mask += sample_mask;
-
-                if (sig & sample_mask)
-                {
-                  OPJ_UINT32 sym;
-
-                  assert(dp[3 * stride] != 0);
-                  sym = cwd & 1;
-                  dp[3 * stride] ^= (1 - sym) << (p - 1);
-                  dp[3 * stride] |= half;
-                  cwd >>= 1;
-                }
-                sample_mask += sample_mask;
-              }
-              col_mask <<= 4; //next column
+        /* Allocate temporary memory if needed */
+        if (cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA > t1->cblkdatabuffersize) {
+            cblkdata = (OPJ_BYTE*)opj_realloc(
+                           t1->cblkdatabuffer, cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA);
+            if (cblkdata == NULL) {
+                return OPJ_FALSE;
             }
-          }
-          // consume data according to the number of bits set
-          rev_advance_mrp(&magref, population_count(sig)); 
-        }
-      }
-
-      if (y >= 4) // update mbr array at the end of each stripe
-      {
-        //generate mbr corresponding to a stripe
-        OPJ_UINT32 *sig = y & 0x4 ? sigma1 : sigma2;
-        OPJ_UINT32 *mbr = y & 0x4 ? mbr1 : mbr2;
-
-        //data is processed in patches of 8 columns, each 
-        // each 32 bits in sigma1 or mbr1 represent 4 rows
-
-        //integrate horizontally
-        OPJ_UINT32 prev = 0; // previous columns
-        for (OPJ_INT32 i = 0; i < width; i += 8, mbr++, sig++)
-        {
-          OPJ_UINT32 t, z;
-
-          mbr[0] = sig[0];         //start with significant samples
-          mbr[0] |= prev >> 28;    //for first column, left neighbors
-          mbr[0] |= sig[0] << 4;   //left neighbors
-          mbr[0] |= sig[0] >> 4;   //right neighbors
-          mbr[0] |= sig[1] << 28;  //for last column, right neighbors
-          prev = sig[0];           // for next group of columns
-
-          //integrate vertically
-          t = mbr[0], z = mbr[0];
-          z |= (t & 0x77777777) << 1; //above neighbors
-          z |= (t & 0xEEEEEEEE) >> 1; //below neighbors
-          mbr[0] = z & ~sig[0]; //remove already significance samples
-        }
-      }
-
-      if (y >= 8) //wait until 8 rows has been processed
-      {
-        OPJ_UINT32 *cur_sig, *cur_mbr, *nxt_sig, *nxt_mbr;
-        OPJ_UINT32 prev;
-        OPJ_UINT32 val;
-
-        // add membership from the next stripe, obtained above
-        cur_sig = y & 0x4 ? sigma2 : sigma1;
-        cur_mbr = y & 0x4 ? mbr2 : mbr1;
-        nxt_sig = y & 0x4 ? sigma1 : sigma2;  //future samples
-        prev = 0; // the columns before these group of 8 columns
-        for (OPJ_INT32 i=0; i < width; i+=8, cur_mbr++, cur_sig++, nxt_sig++)
-        {
-          OPJ_UINT32 t = nxt_sig[0];
-          t |= prev >> 28;        //for first column, left neighbors
-          t |= nxt_sig[0] << 4;   //left neighbors
-          t |= nxt_sig[0] >> 4;   //right neighbors
-          t |= nxt_sig[1] << 28;  //for last column, right neighbors
-          prev = nxt_sig[0];      // for next group of columns
-
-          cur_mbr[0] |= (t & 0x11111111u) << 3; //propagate up to cur_mbr
-          cur_mbr[0] &= ~cur_sig[0]; //remove already significance samples
+            t1->cblkdatabuffer = cblkdata;
+            memset(t1->cblkdatabuffer + cblk_len, 0, OPJ_COMMON_CBLK_DATA_EXTRA);
+            t1->cblkdatabuffersize = cblk_len + OPJ_COMMON_CBLK_DATA_EXTRA;
         }
 
-        //find new locations and get signs
-        cur_sig = y & 0x4 ? sigma2 : sigma1;  
-        cur_mbr = y & 0x4 ? mbr2 : mbr1;
-        nxt_sig = y & 0x4 ? sigma1 : sigma2; //future samples
-        nxt_mbr = y & 0x4 ? mbr1 : mbr2;     //future samples
-        val = 3u << (p - 2); // sample values for newly discovered 
-                             // signficant samples including the bin center
-        for (OPJ_INT32 i = 0; i < width;
-              i += 8, cur_sig++, cur_mbr++, nxt_sig++, nxt_mbr++)
-        {
-          OPJ_UINT32 ux, tx;
-          OPJ_UINT32 mbr = *cur_mbr;
-          OPJ_UINT32 new_sig = 0;
-          if (mbr)  //are there any samples that migt be signficant 
-          {
-            for (OPJ_INT32 n = 0; n < 8; n += 4)
-            {
-              OPJ_UINT32 col_mask;
-              OPJ_UINT32 inv_sig;
-              OPJ_INT32 end;
+        /* Concatenate all chunks */
+        cblkdata = t1->cblkdatabuffer;
+        cblk_len = 0;
+        for (i = 0; i < cblk->numchunks; i++) {
+            memcpy(cblkdata + cblk_len, cblk->chunks[i].data, cblk->chunks[i].len);
+            cblk_len += cblk->chunks[i].len;
+        }
+    } else if (cblk->numchunks == 1) {
+        cblkdata = cblk->chunks[0].data;
+    } else {
+        /* Not sure if that can happen in practice, but keep Coverity from */
+        /* thinking we will dereference a null cblkdata pointer */
+        return OPJ_TRUE;
+    }
 
-              OPJ_UINT32 cwd = frwd_fetch(&sigprop); //get 32 bits
-              OPJ_UINT32 cnt = 0;
+    // OPJ_BYTE* coded_data is a pointer to bitstream
+    coded_data = cblkdata;
+    // OPJ_UINT32* decoded_data is a pointer to decoded codeblock data buf.
+    decoded_data = (OPJ_UINT32*)t1->data;
+    // OPJ_UINT32 num_passes is the number of passes: 1 if CUP only, 2 for
+    // CUP+SPP, and 3 for CUP+SPP+MRP
+    num_passes = cblk->numsegs > 0 ? cblk->segs[0].real_num_passes : 0;
+    num_passes += cblk->numsegs > 1 ? cblk->segs[1].real_num_passes : 0;
+    // OPJ_UINT32 lengths1 is the length of cleanup pass
+    lengths1 = num_passes > 0 ? cblk->segs[0].len : 0;
+    // OPJ_UINT32 lengths2 is the length of refinement passes (either SPP only or SPP+MRP)
+    lengths2 = num_passes > 1 ? cblk->segs[1].len : 0;
+    // OPJ_INT32 width is the decoded codeblock width
+    width = cblk->x1 - cblk->x0;
+    // OPJ_INT32 height is the decoded codeblock height
+    height = cblk->y1 - cblk->y0;
+    // OPJ_INT32 stride is the decoded codeblock buffer stride
+    stride = width;
 
-              OPJ_UINT32 *dp = decoded_data + (y - 8) * stride;
-              dp += i + n; //address for decoded samples
+    /*  sigma1 and sigma2 contain significant (i.e., non-zero) pixel
+     *  locations.  The buffers are used interchangeably, because we need
+     *  more than 4 rows of significance information at a given time.
+     *  Each 32 bits contain significance information for 4 rows of 8
+     *  columns each.  If we denote 32 bits by 0xaaaaaaaa, then each "a" is
+     *  called a nibble and has significance information for 4 rows.
+     *  The least significant nibble has information for the first column,
+     *  and so on. The nibble's LSB is for the first row, and so on.
+     *  Since, at most, we can have 1024 columns in a codeblock, we need 128
+     *  entries; we added 1 for convenience when propagation of significance
+     *  goes outside the structure.
+     *  To work in OpenJPEG these buffers have been expanded to 132.
+     */
+    // OPJ_UINT32 *pflags, *sigma1, *sigma2, *mbr1, *mbr2, *sip, sip_shift;
+    pflags = (OPJ_UINT32 *)t1->flags;
+    sigma1 = pflags;
+    sigma2 = sigma1 + 132;
+    // mbr arrangement is similar to sigma; mbr contains locations
+    // that become significant during significance propagation pass
+    mbr1 = sigma2 + 132;
+    mbr2 = mbr1 + 132;
+    //a pointer to sigma
+    sip = sigma1;  //pointers to arrays to be used interchangeably
+    sip_shift = 0; //the amount of shift needed for sigma
 
-              col_mask = 0xFu << (4 * n); //a mask to select a column
+    if (num_passes > 1 && lengths2 == 0) {
+        if (p_manager_mutex) {
+            opj_mutex_lock(p_manager_mutex);
+        }
+        opj_event_msg(p_manager, EVT_WARNING, "A malformed codeblock that has "
+                      "more than one coding pass, but zero length for "
+                      "2nd and potential 3rd pass.\n");
+        if (p_manager_mutex) {
+            opj_mutex_unlock(p_manager_mutex);
+        }
+        num_passes = 1;
+    }
+    if (num_passes > 3) {
+        if (p_manager_mutex) {
+            opj_mutex_lock(p_manager_mutex);
+        }
+        opj_event_msg(p_manager, EVT_WARNING, "We do not support more than 3 "
+                      "coding passes; This codeblocks has %d passes.\n",
+                      num_passes);
+        if (p_manager_mutex) {
+            opj_mutex_unlock(p_manager_mutex);
+        }
+        return OPJ_FALSE;
+    }
 
-              inv_sig = ~cur_sig[0]; // insignificant samples
+    if (cblk->numbps == 1 && num_passes > 1) {
+        // We do not have enough precision to decode SgnProp nor MagRef passes.
+        // We decode the cleanup passes only
+        if (cannot_decode_spp_mrp_msg == OPJ_FALSE) {
+            if (p_manager_mutex) {
+                opj_mutex_lock(p_manager_mutex);
+            }
+            cannot_decode_spp_mrp_msg = OPJ_TRUE;
+            opj_event_msg(p_manager, EVT_WARNING, "Not enough precision to decode "
+                          "the SgnProp nor MagRef passes.  This message "
+                          "will not be displayed again.\n");
+            if (p_manager_mutex) {
+                opj_mutex_unlock(p_manager_mutex);
+            }
+        }
+        num_passes = 1;
+    }
+    if (cblk->numbps == 0) {
+        // We do not have enough precision to decode the CUP pass with the
+        // center of bin bit set.  The code can be modified to support this
+        // case, without using the center of the bin.
+        if (cannot_decode_due_to_insufficient_precision == OPJ_FALSE) {
+            if (p_manager_mutex) {
+                opj_mutex_lock(p_manager_mutex);
+            }
+            cannot_decode_due_to_insufficient_precision = OPJ_TRUE;
+            opj_event_msg(p_manager, EVT_WARNING, "Not enough precision to decode "
+                          "the cleanup pass. The code should be "
+                          "modified to support this case. This message "
+                          "will not be displayed again.\n");
+            if (p_manager_mutex) {
+                opj_mutex_unlock(p_manager_mutex);
+            }
+        }
+        return OPJ_TRUE;
+    }
 
-              //find the last sample we operate on
-              end = n + 4 + i < width ? n + 4 : width - i;
+    // OPJ_UINT32
+    p = cblk->numbps;
+    // OPJ_UINT32 zero planes plus 1
+    zero_planes_p1 = cblk->Mb - cblk->numbps + 1;
 
-              for (OPJ_INT32 j = n; j < end; ++j, ++dp, col_mask <<= 4)
-              {
-                OPJ_UINT32 sample_mask;
+    // read scup and fix the bytes there
+    lcup = (int)lengths1;  // length of CUP
+    //scup is the length of MEL + VLC
+    scup = (((int)coded_data[lcup - 1]) << 4) + (coded_data[lcup - 2] & 0xF);
+    if (scup < 2 || scup > lcup || scup > 4079) { //something is wrong
+        return OPJ_FALSE;
+    }
 
-                if ((col_mask & mbr) == 0) //no samples need checking
-                  continue;
+    // init structures
+    mel_init(&mel, coded_data, lcup, scup);
+    rev_init(&vlc, coded_data, lcup, scup);
+    frwd_init(&magsgn, coded_data, lcup - scup, 0xFF);
+    if (num_passes > 1) { // needs to be tested
+        frwd_init(&sigprop, coded_data + lengths1, (int)lengths2, 0);
+    }
+    if (num_passes > 2) {
+        rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+    }
 
-                //scan mbr to find a new signficant sample
-                sample_mask = 0x11111111u & col_mask; // LSB
-                if (mbr & sample_mask)
-                {
-                  assert(dp[0] == 0); // the sample must have been 0
-                  if (cwd & 1) //if this sample has become significant
-                  { // must propagate it to nearby samples
-                    OPJ_UINT32 t;
-                    new_sig |= sample_mask;  // new significant samples
-                    t = 0x32u << (j * 4);// propagation to neighbors
-                    mbr |= t & inv_sig; //remove already signifcant samples
-                  }
-                  cwd >>= 1; ++cnt; //consume bit and increment number of
+    /** State storage
+      *  One byte per quad; for 1024 columns, or 512 quads, we need
+      *  512 bytes. We are using 2 extra bytes one on the left and one on
+      *  the right for convenience.
+      *
+      *  The MSB bit in each byte is (\sigma^nw | \sigma^n), and the 7 LSBs
+      *  contain max(E^nw, E^n)
+      */
+
+    // 514 is enough for a block width of 1024, +2 extra
+    // here expanded to 528
+    line_state = (OPJ_UINT8 *)(mbr2 + 132);
+
+    //initial 2 lines
+    /////////////////
+    lsp = line_state;              // point to line state
+    lsp[0] = 0;                    // for initial row of quad, we set to 0
+    run = mel_get_run(&mel);    // decode runs of events from MEL bitstrm
+    // data represented as runs of 0 events
+    // See mel_decode description
+    qinf[0] = qinf[1] = 0;      // quad info decoded from VLC bitstream
+    c_q = 0;                    // context for quad q
+    sp = decoded_data;          // decoded codeblock samples
+    // vlc_val;                 // fetched data from VLC bitstream
+
+    for (x = 0; x < width; x += 4) { // one iteration per quad pair
+        OPJ_UINT32 U_q[2]; // u values for the quad pair
+        OPJ_UINT32 uvlc_mode;
+        OPJ_UINT32 consumed_bits;
+        OPJ_UINT32 m_n, v_n;
+        OPJ_UINT32 ms_val;
+        OPJ_UINT32 locs;
+
+        // decode VLC
+        /////////////
+
+        //first quad
+        // Get the head of the VLC bitstream. One fetch is enough for two
+        // quads, since the largest VLC code is 7 bits, and maximum number of
+        // bits used for u is 8.  Therefore for two quads we need 30 bits
+        // (if we include unstuffing, then 32 bits are enough, since we have
+        // a maximum of one stuffing per two bytes)
+        vlc_val = rev_fetch(&vlc);
+
+        //decode VLC using the context c_q and the head of the VLC bitstream
+        qinf[0] = vlc_tbl0[(c_q << 7) | (vlc_val & 0x7F) ];
+
+        if (c_q == 0) { // if zero context, we need to use one MEL event
+            run -= 2; //the number of 0 events is multiplied by 2, so subtract 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            qinf[0] = (run == -1) ? qinf[0] : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0) {
+                run = mel_get_run(&mel);    // get another run
+            }
+        }
+
+        // prepare context for the next quad; eqn. 1 in ITU T.814
+        c_q = ((qinf[0] & 0x10) >> 4) | ((qinf[0] & 0xE0) >> 5);
+
+        //remove data from vlc stream (0 bits are removed if qinf is not used)
+        vlc_val = rev_advance(&vlc, qinf[0] & 0x7);
+
+        //update sigma
+        // The update depends on the value of x; consider one OPJ_UINT32
+        // if x is 0, 8, 16 and so on, then this line update c locations
+        //      nibble (4 bits) number   0 1 2 3 4 5 6 7
+        //                         LSB   c c 0 0 0 0 0 0
+        //                               c c 0 0 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        // if x is 4, 12, 20, then this line update locations c
+        //      nibble (4 bits) number   0 1 2 3 4 5 6 7
+        //                         LSB   0 0 0 0 c c 0 0
+        //                               0 0 0 0 c c 0 0
+        //                               0 0 0 0 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        *sip |= (((qinf[0] & 0x30) >> 4) | ((qinf[0] & 0xC0) >> 2)) << sip_shift;
+
+        //second quad
+        qinf[1] = 0;
+        if (x + 2 < width) { // do not run if codeblock is narrower
+            //decode VLC using the context c_q and the head of the VLC bitstream
+            qinf[1] = vlc_tbl0[(c_q << 7) | (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) { //zero context
+                run -= 2; //subtract 2, since events number if multiplied by 2
+
+                // if event is 0, discard decoded qinf
+                qinf[1] = (run == -1) ? qinf[1] : 0;
+
+                if (run < 0) { // have we consumed all events in a run
+                    run = mel_get_run(&mel);    // if yes, then get another run
+                }
+            }
+
+            //prepare context for the next quad, eqn. 1 in ITU T.814
+            c_q = ((qinf[1] & 0x10) >> 4) | ((qinf[1] & 0xE0) >> 5);
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance(&vlc, qinf[1] & 0x7);
+        }
+
+        //update sigma
+        // The update depends on the value of x; consider one OPJ_UINT32
+        // if x is 0, 8, 16 and so on, then this line update c locations
+        //      nibble (4 bits) number   0 1 2 3 4 5 6 7
+        //                         LSB   0 0 c c 0 0 0 0
+        //                               0 0 c c 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        // if x is 4, 12, 20, then this line update locations c
+        //      nibble (4 bits) number   0 1 2 3 4 5 6 7
+        //                         LSB   0 0 0 0 0 0 c c
+        //                               0 0 0 0 0 0 c c
+        //                               0 0 0 0 0 0 0 0
+        //                               0 0 0 0 0 0 0 0
+        *sip |= (((qinf[1] & 0x30) | ((qinf[1] & 0xC0) << 2))) << (4 + sip_shift);
+
+        sip += x & 0x7 ? 1 : 0; // move sigma pointer to next entry
+        sip_shift ^= 0x10;      // increment/decrement sip_shift by 16
+
+        // retrieve u
+        /////////////
+
+        // uvlc_mode is made up of u_offset bits from the quad pair
+        uvlc_mode = ((qinf[0] & 0x8) >> 3) | ((qinf[1] & 0x8) >> 2);
+        if (uvlc_mode == 3) { // if both u_offset are set, get an event from
+            // the MEL run of events
+            run -= 2; //subtract 2, since events number if multiplied by 2
+            uvlc_mode += (run == -1) ? 1 : 0; //increment uvlc_mode if event is 1
+            if (run < 0) { // if run is consumed (run is -1 or -2), get another run
+                run = mel_get_run(&mel);
+            }
+        }
+        //decode uvlc_mode to get u for both quads
+        consumed_bits = decode_init_uvlc(vlc_val, uvlc_mode, U_q);
+        if (U_q[0] > zero_planes_p1 || U_q[1] > zero_planes_p1) {
+            if (p_manager_mutex) {
+                opj_mutex_lock(p_manager_mutex);
+            }
+            opj_event_msg(p_manager, EVT_ERROR, "Malformed HT codeblock. Decoding "
+                          "this codeblock is stopped.\n");
+            if (p_manager_mutex) {
+                opj_mutex_unlock(p_manager_mutex);
+            }
+            return OPJ_FALSE;
+        }
+
+        //consume u bits in the VLC code
+        vlc_val = rev_advance(&vlc, consumed_bits);
+
+        //decode magsgn and update line_state
+        /////////////////////////////////////
+
+        //We obtain a mask for the sample locations that need evaluation
+        locs = 0xFF;
+        if (x + 4 > width) {
+            locs >>= (x + 4 - width) << 1;    // limits width
+        }
+        locs = height > 1 ? locs : (locs & 0x55);         // limits height
+
+        //first quad, starting at first sample in quad and moving on
+        if (qinf[0] & 0x10) { //is it signifcant? (sigma_n)
+            OPJ_UINT32 val;
+
+            ms_val = frwd_fetch(&magsgn);         //get 32 bits of magsgn data
+            m_n = U_q[0] - ((qinf[0] >> 12) & 1); //evaluate m_n (number of bits
+            // to read from bitstream), using EMB e_k
+            frwd_advance(&magsgn, m_n);         //consume m_n
+            val = ms_val << 31;                 //get sign bit
+            v_n = ms_val & ((1U << m_n) - 1);   //keep only m_n bits
+            v_n |= ((qinf[0] & 0x100) >> 8) << m_n;  //add EMB e_1 as MSB
+            v_n |= 1;                                //add center of bin
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            sp[0] = val | ((v_n + 2) << (p - 1));
+        } else if (locs & 0x1) { // if this is outside the codeblock, set the
+            sp[0] = 0;    // sample to zero
+        }
+
+        if (qinf[0] & 0x20) { //sigma_n
+            OPJ_UINT32 val, t;
+
+            ms_val = frwd_fetch(&magsgn);         //get 32 bits
+            m_n = U_q[0] - ((qinf[0] >> 13) & 1); //m_n, uses EMB e_k
+            frwd_advance(&magsgn, m_n);           //consume m_n
+            val = ms_val << 31;                   //get sign bit
+            v_n = ms_val & ((1U << m_n) - 1);     //keep only m_n bits
+            v_n |= ((qinf[0] & 0x200) >> 9) << m_n; //add EMB e_1
+            v_n |= 1;                               //bin center
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            sp[stride] = val | ((v_n + 2) << (p - 1));
+
+            //update line_state: bit 7 (\sigma^N), and E^N
+            t = lsp[0] & 0x7F;       // keep E^NW
+            v_n = 32 - count_leading_zeros(v_n);
+            lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n)); //max(E^NW, E^N) | s
+        } else if (locs & 0x2) { // if this is outside the codeblock, set the
+            sp[stride] = 0;    //no need to update line_state
+        }
+
+        ++lsp; // move to next quad information
+        ++sp;  // move to next column of samples
+
+        //this is similar to the above two samples
+        if (qinf[0] & 0x40) {
+            OPJ_UINT32 val;
+
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[0] - ((qinf[0] >> 14) & 1);
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= (((qinf[0] & 0x400) >> 10) << m_n);
+            v_n |= 1;
+            sp[0] = val | ((v_n + 2) << (p - 1));
+        } else if (locs & 0x4) {
+            sp[0] = 0;
+        }
+
+        lsp[0] = 0;
+        if (qinf[0] & 0x80) {
+            OPJ_UINT32 val;
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[0] - ((qinf[0] >> 15) & 1); //m_n
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= ((qinf[0] & 0x800) >> 11) << m_n;
+            v_n |= 1; //center of bin
+            sp[stride] = val | ((v_n + 2) << (p - 1));
+
+            //line_state: bit 7 (\sigma^NW), and E^NW for next quad
+            lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
+        } else if (locs & 0x8) { //if outside set to 0
+            sp[stride] = 0;
+        }
+
+        ++sp; //move to next column
+
+        //second quad
+        if (qinf[1] & 0x10) {
+            OPJ_UINT32 val;
+
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[1] - ((qinf[1] >> 12) & 1); //m_n
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= (((qinf[1] & 0x100) >> 8) << m_n);
+            v_n |= 1;
+            sp[0] = val | ((v_n + 2) << (p - 1));
+        } else if (locs & 0x10) {
+            sp[0] = 0;
+        }
+
+        if (qinf[1] & 0x20) {
+            OPJ_UINT32 val, t;
+
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[1] - ((qinf[1] >> 13) & 1); //m_n
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= (((qinf[1] & 0x200) >> 9) << m_n);
+            v_n |= 1;
+            sp[stride] = val | ((v_n + 2) << (p - 1));
+
+            //update line_state: bit 7 (\sigma^N), and E^N
+            t = lsp[0] & 0x7F;            //E^NW
+            v_n = 32 - count_leading_zeros(v_n);     //E^N
+            lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n)); //max(E^NW, E^N) | s
+        } else if (locs & 0x20) {
+            sp[stride] = 0;    //no need to update line_state
+        }
+
+        ++lsp; //move line state to next quad
+        ++sp;  //move to next sample
+
+        if (qinf[1] & 0x40) {
+            OPJ_UINT32 val;
+
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[1] - ((qinf[1] >> 14) & 1); //m_n
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= (((qinf[1] & 0x400) >> 10) << m_n);
+            v_n |= 1;
+            sp[0] = val | ((v_n + 2) << (p - 1));
+        } else if (locs & 0x40) {
+            sp[0] = 0;
+        }
+
+        lsp[0] = 0;
+        if (qinf[1] & 0x80) {
+            OPJ_UINT32 val;
+
+            ms_val = frwd_fetch(&magsgn);
+            m_n = U_q[1] - ((qinf[1] >> 15) & 1); //m_n
+            frwd_advance(&magsgn, m_n);
+            val = ms_val << 31;
+            v_n = ms_val & ((1U << m_n) - 1);
+            v_n |= (((qinf[1] & 0x800) >> 11) << m_n);
+            v_n |= 1; //center of bin
+            sp[stride] = val | ((v_n + 2) << (p - 1));
+
+            //line_state: bit 7 (\sigma^NW), and E^NW for next quad
+            lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
+        } else if (locs & 0x80) {
+            sp[stride] = 0;
+        }
+
+        ++sp;
+    }
+
+    //non-initial lines
+    //////////////////////////
+    for (y = 2; y < height; /*done at the end of loop*/) {
+        OPJ_UINT32 *sip;
+        OPJ_UINT8 ls0;
+        OPJ_INT32 x;
+
+        sip_shift ^= 0x2;  // shift sigma to the upper half od the nibble
+        sip_shift &= 0xFFFFFFEFU; //move back to 0 (it might have been at 0x10)
+        sip = y & 0x4 ? sigma2 : sigma1; //choose sigma array
+
+        lsp = line_state;
+        ls0 = lsp[0];                   // read the line state value
+        lsp[0] = 0;                     // and set it to zero
+        sp = decoded_data + y * stride; // generated samples
+        c_q = 0;                        // context
+        for (x = 0; x < width; x += 4) {
+            OPJ_UINT32 U_q[2];
+            OPJ_UINT32 uvlc_mode, consumed_bits;
+            OPJ_UINT32 m_n, v_n;
+            OPJ_UINT32 ms_val;
+            OPJ_UINT32 locs;
+
+            // decode vlc
+            /////////////
+
+            //first quad
+            // get context, eqn. 2 ITU T.814
+            // c_q has \sigma^W | \sigma^SW
+            c_q |= (ls0 >> 7);          //\sigma^NW | \sigma^N
+            c_q |= (lsp[1] >> 5) & 0x4; //\sigma^NE | \sigma^NF
+
+            //the following is very similar to previous code, so please refer to
+            // that
+            vlc_val = rev_fetch(&vlc);
+            qinf[0] = vlc_tbl1[(c_q << 7) | (vlc_val & 0x7F)];
+            if (c_q == 0) { //zero context
+                run -= 2;
+                qinf[0] = (run == -1) ? qinf[0] : 0;
+                if (run < 0) {
+                    run = mel_get_run(&mel);
+                }
+            }
+            //prepare context for the next quad, \sigma^W | \sigma^SW
+            c_q = ((qinf[0] & 0x40) >> 5) | ((qinf[0] & 0x80) >> 6);
+
+            //remove data from vlc stream
+            vlc_val = rev_advance(&vlc, qinf[0] & 0x7);
+
+            //update sigma
+            // The update depends on the value of x and y; consider one OPJ_UINT32
+            // if x is 0, 8, 16 and so on, and y is 2, 6, etc., then this
+            // line update c locations
+            //      nibble (4 bits) number   0 1 2 3 4 5 6 7
+            //                         LSB   0 0 0 0 0 0 0 0
+            //                               0 0 0 0 0 0 0 0
+            //                               c c 0 0 0 0 0 0
+            //                               c c 0 0 0 0 0 0
+            *sip |= (((qinf[0] & 0x30) >> 4) | ((qinf[0] & 0xC0) >> 2)) << sip_shift;
+
+            //second quad
+            qinf[1] = 0;
+            if (x + 2 < width) {
+                c_q |= (lsp[1] >> 7);
+                c_q |= (lsp[2] >> 5) & 0x4;
+                qinf[1] = vlc_tbl1[(c_q << 7) | (vlc_val & 0x7F)];
+                if (c_q == 0) { //zero context
+                    run -= 2;
+                    qinf[1] = (run == -1) ? qinf[1] : 0;
+                    if (run < 0) {
+                        run = mel_get_run(&mel);
+                    }
+                }
+                //prepare context for the next quad
+                c_q = ((qinf[1] & 0x40) >> 5) | ((qinf[1] & 0x80) >> 6);
+                //remove data from vlc stream
+                vlc_val = rev_advance(&vlc, qinf[1] & 0x7);
+            }
+
+            //update sigma
+            *sip |= (((qinf[1] & 0x30) | ((qinf[1] & 0xC0) << 2))) << (4 + sip_shift);
+
+            sip += x & 0x7 ? 1 : 0;
+            sip_shift ^= 0x10;
+
+            //retrieve u
+            ////////////
+            uvlc_mode = ((qinf[0] & 0x8) >> 3) | ((qinf[1] & 0x8) >> 2);
+            consumed_bits = decode_noninit_uvlc(vlc_val, uvlc_mode, U_q);
+            vlc_val = rev_advance(&vlc, consumed_bits);
+
+            //calculate E^max and add it to U_q, eqns 5 and 6 in ITU T.814
+            if ((qinf[0] & 0xF0) & ((qinf[0] & 0xF0) - 1)) { // is \gamma_q 1?
+                OPJ_UINT32 E = (ls0 & 0x7Fu);
+                E = E > (lsp[1] & 0x7Fu) ? E : (lsp[1] & 0x7Fu); //max(E, E^NE, E^NF)
+                //since U_q already has u_q + 1, we subtract 2 instead of 1
+                U_q[0] += E > 2 ? E - 2 : 0;
+            }
+
+            if ((qinf[1] & 0xF0) & ((qinf[1] & 0xF0) - 1)) { //is \gamma_q 1?
+                OPJ_UINT32 E = (lsp[1] & 0x7Fu);
+                E = E > (lsp[2] & 0x7Fu) ? E : (lsp[2] & 0x7Fu); //max(E, E^NE, E^NF)
+                //since U_q already has u_q + 1, we subtract 2 instead of 1
+                U_q[1] += E > 2 ? E - 2 : 0;
+            }
+
+            if (U_q[0] > zero_planes_p1 || U_q[1] > zero_planes_p1) {
+                if (p_manager_mutex) {
+                    opj_mutex_lock(p_manager_mutex);
+                }
+                opj_event_msg(p_manager, EVT_ERROR, "Malformed HT codeblock. "
+                              "Decoding this codeblock is stopped.\n");
+                if (p_manager_mutex) {
+                    opj_mutex_unlock(p_manager_mutex);
+                }
+                return OPJ_FALSE;
+            }
+
+            ls0 = lsp[2]; //for next double quad
+            lsp[1] = lsp[2] = 0;
+
+            //decode magsgn and update line_state
+            /////////////////////////////////////
+
+            //locations where samples need update
+            locs = 0xFF;
+            if (x + 4 > width) {
+                locs >>= (x + 4 - width) << 1;
+            }
+            locs = height > 1 ? locs : (locs & 0x55);
+
+
+            if (qinf[0] & 0x10) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[0] - ((qinf[0] >> 12) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= ((qinf[0] & 0x100) >> 8) << m_n;
+                v_n |= 1; //center of bin
+                sp[0] = val | ((v_n + 2) << (p - 1));
+            } else if (locs & 0x1) {
+                sp[0] = 0;
+            }
+
+            if (qinf[0] & 0x20) { //sigma_n
+                OPJ_UINT32 val, t;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[0] - ((qinf[0] >> 13) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= ((qinf[0] & 0x200) >> 9) << m_n;
+                v_n |= 1; //center of bin
+                sp[stride] = val | ((v_n + 2) << (p - 1));
+
+                //update line_state: bit 7 (\sigma^N), and E^N
+                t = lsp[0] & 0x7F;          //E^NW
+                v_n = 32 - count_leading_zeros(v_n);
+                lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n));
+            } else if (locs & 0x2) {
+                sp[stride] = 0;    //no need to update line_state
+            }
+
+            ++lsp;
+            ++sp;
+
+            if (qinf[0] & 0x40) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[0] - ((qinf[0] >> 14) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= (((qinf[0] & 0x400) >> 10) << m_n);
+                v_n |= 1;                            //center of bin
+                sp[0] = val | ((v_n + 2) << (p - 1));
+            } else if (locs & 0x4) {
+                sp[0] = 0;
+            }
+
+            if (qinf[0] & 0x80) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[0] - ((qinf[0] >> 15) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= ((qinf[0] & 0x800) >> 11) << m_n;
+                v_n |= 1; //center of bin
+                sp[stride] = val | ((v_n + 2) << (p - 1));
+
+                //update line_state: bit 7 (\sigma^NW), and E^NW for next quad
+                lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
+            } else if (locs & 0x8) {
+                sp[stride] = 0;
+            }
+
+            ++sp;
+
+            if (qinf[1] & 0x10) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[1] - ((qinf[1] >> 12) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= (((qinf[1] & 0x100) >> 8) << m_n);
+                v_n |= 1;                            //center of bin
+                sp[0] = val | ((v_n + 2) << (p - 1));
+            } else if (locs & 0x10) {
+                sp[0] = 0;
+            }
+
+            if (qinf[1] & 0x20) { //sigma_n
+                OPJ_UINT32 val, t;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[1] - ((qinf[1] >> 13) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= (((qinf[1] & 0x200) >> 9) << m_n);
+                v_n |= 1; //center of bin
+                sp[stride] = val | ((v_n + 2) << (p - 1));
+
+                //update line_state: bit 7 (\sigma^N), and E^N
+                t = lsp[0] & 0x7F;          //E^NW
+                v_n = 32 - count_leading_zeros(v_n);
+                lsp[0] = (OPJ_UINT8)(0x80 | (t > v_n ? t : v_n));
+            } else if (locs & 0x20) {
+                sp[stride] = 0;    //no need to update line_state
+            }
+
+            ++lsp;
+            ++sp;
+
+            if (qinf[1] & 0x40) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[1] - ((qinf[1] >> 14) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= (((qinf[1] & 0x400) >> 10) << m_n);
+                v_n |= 1;                            //center of bin
+                sp[0] = val | ((v_n + 2) << (p - 1));
+            } else if (locs & 0x40) {
+                sp[0] = 0;
+            }
+
+            if (qinf[1] & 0x80) { //sigma_n
+                OPJ_UINT32 val;
+
+                ms_val = frwd_fetch(&magsgn);
+                m_n = U_q[1] - ((qinf[1] >> 15) & 1); //m_n
+                frwd_advance(&magsgn, m_n);
+                val = ms_val << 31;
+                v_n = ms_val & ((1U << m_n) - 1);
+                v_n |= (((qinf[1] & 0x800) >> 11) << m_n);
+                v_n |= 1; //center of bin
+                sp[stride] = val | ((v_n + 2) << (p - 1));
+
+                //update line_state: bit 7 (\sigma^NW), and E^NW for next quad
+                lsp[0] = (OPJ_UINT8)(0x80 | (32 - count_leading_zeros(v_n)));
+            } else if (locs & 0x80) {
+                sp[stride] = 0;
+            }
+
+            ++sp;
+        }
+
+        y += 2;
+        if (num_passes > 1 && (y & 3) == 0) { //executed at multiples of 4
+            // This is for SPP and potentially MRP
+
+            if (num_passes > 2) { //do MRP
+                // select the current stripe
+                OPJ_UINT32 *cur_sig = y & 0x4 ? sigma1 : sigma2;
+                // the address of the data that needs updating
+                OPJ_UINT32 *dpp = decoded_data + (y - 4) * stride;
+                OPJ_UINT32 half = 1u << (p - 2); // half the center of the bin
+                OPJ_INT32 i;
+                for (i = 0; i < width; i += 8) {
+                    //Process one entry from sigma array at a time
+                    // Each nibble (4 bits) in the sigma array represents 4 rows,
+                    // and the 32 bits contain 8 columns
+                    OPJ_UINT32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+                    OPJ_UINT32 sig = *cur_sig++; // 32 bit that will be processed now
+                    OPJ_UINT32 col_mask = 0xFu;  // a mask for a column in sig
+                    OPJ_UINT32 *dp = dpp + i;    // next column in decode samples
+                    if (sig) { // if any of the 32 bits are set
+                        int j;
+                        for (j = 0; j < 8; ++j, dp++) { //one column at a time
+                            if (sig & col_mask) { // lowest nibble
+                                OPJ_UINT32 sample_mask = 0x11111111u & col_mask; //LSB
+
+                                if (sig & sample_mask) { //if LSB is set
+                                    OPJ_UINT32 sym;
+
+                                    assert(dp[0] != 0); // decoded value cannot be zero
+                                    sym = cwd & 1; // get it value
+                                    // remove center of bin if sym is 0
+                                    dp[0] ^= (1 - sym) << (p - 1);
+                                    dp[0] |= half;      // put half the center of bin
+                                    cwd >>= 1;          //consume word
+                                }
+                                sample_mask += sample_mask; //next row
+
+                                if (sig & sample_mask) {
+                                    OPJ_UINT32 sym;
+
+                                    assert(dp[stride] != 0);
+                                    sym = cwd & 1;
+                                    dp[stride] ^= (1 - sym) << (p - 1);
+                                    dp[stride] |= half;
+                                    cwd >>= 1;
+                                }
+                                sample_mask += sample_mask;
+
+                                if (sig & sample_mask) {
+                                    OPJ_UINT32 sym;
+
+                                    assert(dp[2 * stride] != 0);
+                                    sym = cwd & 1;
+                                    dp[2 * stride] ^= (1 - sym) << (p - 1);
+                                    dp[2 * stride] |= half;
+                                    cwd >>= 1;
+                                }
+                                sample_mask += sample_mask;
+
+                                if (sig & sample_mask) {
+                                    OPJ_UINT32 sym;
+
+                                    assert(dp[3 * stride] != 0);
+                                    sym = cwd & 1;
+                                    dp[3 * stride] ^= (1 - sym) << (p - 1);
+                                    dp[3 * stride] |= half;
+                                    cwd >>= 1;
+                                }
+                                sample_mask += sample_mask;
+                            }
+                            col_mask <<= 4; //next column
+                        }
+                    }
+                    // consume data according to the number of bits set
+                    rev_advance_mrp(&magref, population_count(sig));
+                }
+            }
+
+            if (y >= 4) { // update mbr array at the end of each stripe
+                //generate mbr corresponding to a stripe
+                OPJ_UINT32 *sig = y & 0x4 ? sigma1 : sigma2;
+                OPJ_UINT32 *mbr = y & 0x4 ? mbr1 : mbr2;
+
+                //data is processed in patches of 8 columns; each 32-bit entry
+                // in sigma1 or mbr1 holds 8 columns of 4 rows (one nibble each)
+
+                //integrate horizontally
+                OPJ_UINT32 prev = 0; // previous columns
+                OPJ_INT32 i;
+                for (i = 0; i < width; i += 8, mbr++, sig++) {
+                    OPJ_UINT32 t, z;
+
+                    mbr[0] = sig[0];         //start with significant samples
+                    mbr[0] |= prev >> 28;    //for first column, left neighbors
+                    mbr[0] |= sig[0] << 4;   //left neighbors
+                    mbr[0] |= sig[0] >> 4;   //right neighbors
+                    mbr[0] |= sig[1] << 28;  //for last column, right neighbors
+                    prev = sig[0];           // for next group of columns
+
+                    //integrate vertically
+                    t = mbr[0], z = mbr[0];
+                    z |= (t & 0x77777777) << 1; //above neighbors
+                    z |= (t & 0xEEEEEEEE) >> 1; //below neighbors
+                    mbr[0] = z & ~sig[0]; //remove already significance samples
+                }
+            }
+
+            if (y >= 8) { //wait until 8 rows has been processed
+                OPJ_UINT32 *cur_sig, *cur_mbr, *nxt_sig, *nxt_mbr;
+                OPJ_UINT32 prev;
+                OPJ_UINT32 val;
+                OPJ_INT32 i;
+
+                // add membership from the next stripe, obtained above
+                cur_sig = y & 0x4 ? sigma2 : sigma1;
+                cur_mbr = y & 0x4 ? mbr2 : mbr1;
+                nxt_sig = y & 0x4 ? sigma1 : sigma2;  //future samples
+                prev = 0; // the columns before these group of 8 columns
+                for (i = 0; i < width; i += 8, cur_mbr++, cur_sig++, nxt_sig++) {
+                    OPJ_UINT32 t = nxt_sig[0];
+                    t |= prev >> 28;        //for first column, left neighbors
+                    t |= nxt_sig[0] << 4;   //left neighbors
+                    t |= nxt_sig[0] >> 4;   //right neighbors
+                    t |= nxt_sig[1] << 28;  //for last column, right neighbors
+                    prev = nxt_sig[0];      // for next group of columns
+
+                    cur_mbr[0] |= (t & 0x11111111u) << 3; //propagate up to cur_mbr
+                    cur_mbr[0] &= ~cur_sig[0]; //remove already significance samples
+                }
+
+                //find new locations and get signs
+                cur_sig = y & 0x4 ? sigma2 : sigma1;
+                cur_mbr = y & 0x4 ? mbr2 : mbr1;
+                nxt_sig = y & 0x4 ? sigma1 : sigma2; //future samples
+                nxt_mbr = y & 0x4 ? mbr1 : mbr2;     //future samples
+                val = 3u << (p - 2); // sample values for newly discovered
+                // significant samples including the bin center
+                for (i = 0; i < width;
+                        i += 8, cur_sig++, cur_mbr++, nxt_sig++, nxt_mbr++) {
+                    OPJ_UINT32 ux, tx;
+                    OPJ_UINT32 mbr = *cur_mbr;
+                    OPJ_UINT32 new_sig = 0;
+                    if (mbr) { //are there any samples that migt be signficant
+                        OPJ_INT32 n;
+                        for (n = 0; n < 8; n += 4) {
+                            OPJ_UINT32 col_mask;
+                            OPJ_UINT32 inv_sig;
+                            OPJ_INT32 end;
+                            OPJ_INT32 j;
+
+                            OPJ_UINT32 cwd = frwd_fetch(&sigprop); //get 32 bits
+                            OPJ_UINT32 cnt = 0;
+
+                            OPJ_UINT32 *dp = decoded_data + (y - 8) * stride;
+                            dp += i + n; //address for decoded samples
+
+                            col_mask = 0xFu << (4 * n); //a mask to select a column
+
+                            inv_sig = ~cur_sig[0]; // insignificant samples
+
+                            //find the last sample we operate on
+                            end = n + 4 + i < width ? n + 4 : width - i;
+
+                            for (j = n; j < end; ++j, ++dp, col_mask <<= 4) {
+                                OPJ_UINT32 sample_mask;
+
+                                if ((col_mask & mbr) == 0) { //no samples need checking
+                                    continue;
+                                }
+
+                                //scan mbr to find a new significant sample
+                                sample_mask = 0x11111111u & col_mask; // LSB
+                                if (mbr & sample_mask) {
+                                    assert(dp[0] == 0); // the sample must have been 0
+                                    if (cwd & 1) { //if this sample has become significant
+                                        // must propagate it to nearby samples
+                                        OPJ_UINT32 t;
+                                        new_sig |= sample_mask;  // new significant samples
+                                        t = 0x32u << (j * 4);// propagation to neighbors
+                                        mbr |= t & inv_sig; //remove already signifcant samples
+                                    }
+                                    cwd >>= 1;
+                                    ++cnt; //consume bit and increment number of
                                     //consumed bits
+                                }
+
+                                sample_mask += sample_mask;  // next row
+                                if (mbr & sample_mask) {
+                                    assert(dp[stride] == 0);
+                                    if (cwd & 1) {
+                                        OPJ_UINT32 t;
+                                        new_sig |= sample_mask;
+                                        t = 0x74u << (j * 4);
+                                        mbr |= t & inv_sig;
+                                    }
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+
+                                sample_mask += sample_mask;
+                                if (mbr & sample_mask) {
+                                    assert(dp[2 * stride] == 0);
+                                    if (cwd & 1) {
+                                        OPJ_UINT32 t;
+                                        new_sig |= sample_mask;
+                                        t = 0xE8u << (j * 4);
+                                        mbr |= t & inv_sig;
+                                    }
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+
+                                sample_mask += sample_mask;
+                                if (mbr & sample_mask) {
+                                    assert(dp[3 * stride] == 0);
+                                    if (cwd & 1) {
+                                        OPJ_UINT32 t;
+                                        new_sig |= sample_mask;
+                                        t = 0xC0u << (j * 4);
+                                        mbr |= t & inv_sig;
+                                    }
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+                            }
+
+                            //obtain signs here
+                            if (new_sig & (0xFFFFu << (4 * n))) { //if any
+                                OPJ_UINT32 col_mask;
+                                OPJ_INT32 j;
+                                OPJ_UINT32 *dp = decoded_data + (y - 8) * stride;
+                                dp += i + n; // decoded samples address
+                                col_mask = 0xFu << (4 * n); //mask to select a column
+
+                                for (j = n; j < end; ++j, ++dp, col_mask <<= 4) {
+                                    OPJ_UINT32 sample_mask;
+
+                                    if ((col_mask & new_sig) == 0) { //if non is signficant
+                                        continue;
+                                    }
+
+                                    //scan 4 signs
+                                    sample_mask = 0x11111111u & col_mask;
+                                    if (new_sig & sample_mask) {
+                                        assert(dp[0] == 0);
+                                        dp[0] |= ((cwd & 1) << 31) | val; //put value and sign
+                                        cwd >>= 1;
+                                        ++cnt; //consume bit and increment number
+                                        //of consumed bits
+                                    }
+
+                                    sample_mask += sample_mask;
+                                    if (new_sig & sample_mask) {
+                                        assert(dp[stride] == 0);
+                                        dp[stride] |= ((cwd & 1) << 31) | val;
+                                        cwd >>= 1;
+                                        ++cnt;
+                                    }
+
+                                    sample_mask += sample_mask;
+                                    if (new_sig & sample_mask) {
+                                        assert(dp[2 * stride] == 0);
+                                        dp[2 * stride] |= ((cwd & 1) << 31) | val;
+                                        cwd >>= 1;
+                                        ++cnt;
+                                    }
+
+                                    sample_mask += sample_mask;
+                                    if (new_sig & sample_mask) {
+                                        assert(dp[3 * stride] == 0);
+                                        dp[3 * stride] |= ((cwd & 1) << 31) | val;
+                                        cwd >>= 1;
+                                        ++cnt;
+                                    }
+                                }
+
+                            }
+                            frwd_advance(&sigprop, cnt); //consume the bits from bitstrm
+                            cnt = 0;
+
+                            //update the next 8 columns
+                            if (n == 4) {
+                                //horizontally
+                                OPJ_UINT32 t = new_sig >> 28;
+                                t |= ((t & 0xE) >> 1) | ((t & 7) << 1);
+                                cur_mbr[1] |= t & ~cur_sig[1];
+                            }
+                        }
+                    }
+                    //update the next stripe (vertically propagation)
+                    new_sig |= cur_sig[0];
+                    ux = (new_sig & 0x88888888) >> 3;
+                    tx = ux | (ux << 4) | (ux >> 4); //left and right neighbors
+                    if (i > 0) {
+                        nxt_mbr[-1] |= (ux << 28) & ~nxt_sig[-1];
+                    }
+                    nxt_mbr[0] |= tx & ~nxt_sig[0];
+                    nxt_mbr[1] |= (ux >> 28) & ~nxt_sig[1];
                 }
 
-                sample_mask += sample_mask;  // next row
-                if (mbr & sample_mask)
-                {
-                  assert(dp[stride] == 0);
-                  if (cwd & 1)
-                  {
-                    OPJ_UINT32 t;
-                    new_sig |= sample_mask;
-                    t = 0x74u << (j * 4);
-                    mbr |= t & inv_sig;
-                  }
-                  cwd >>= 1; ++cnt;
-                }
-
-                sample_mask += sample_mask;
-                if (mbr & sample_mask)
-                {
-                  assert(dp[2 * stride] == 0);
-                  if (cwd & 1)
-                  {
-                    OPJ_UINT32 t;
-                    new_sig |= sample_mask;
-                    t = 0xE8u << (j * 4);
-                    mbr |= t & inv_sig;
-                  }
-                  cwd >>= 1; ++cnt;
-                }
-
-                sample_mask += sample_mask;
-                if (mbr & sample_mask)
-                {
-                  assert(dp[3 * stride] == 0);
-                  if (cwd & 1)
-                  {
-                    OPJ_UINT32 t;
-                    new_sig |= sample_mask;
-                    t = 0xC0u << (j * 4);
-                    mbr |= t & inv_sig;
-                  }
-                  cwd >>= 1; ++cnt;
-                }
-              }
-
-              //obtain signs here
-              if (new_sig & (0xFFFFu << (4 * n))) //if any
-              {
-                OPJ_UINT32 col_mask;
-                OPJ_UINT32 *dp = decoded_data + (y - 8) * stride;
-                dp += i + n; // decoded samples address
-                col_mask = 0xFu << (4 * n); //mask to select a column
-
-                for (OPJ_INT32 j = n; j < end; ++j, ++dp, col_mask <<= 4)
-                {
-                  OPJ_UINT32 sample_mask;
-
-                  if ((col_mask & new_sig) == 0) //if non is signficant
-                    continue;
-
-                  //scan 4 signs
-                  sample_mask = 0x11111111u & col_mask;
-                  if (new_sig & sample_mask)
-                  {
-                    assert(dp[0] == 0);
-                    dp[0] |= ((cwd & 1) << 31) | val; //put value and sign
-                    cwd >>= 1; ++cnt; //consume bit and increment number
-                                      //of consumed bits
-                  }
-
-                  sample_mask += sample_mask;
-                  if (new_sig & sample_mask)
-                  {
-                    assert(dp[stride] == 0);
-                    dp[stride] |= ((cwd & 1) << 31) | val;
-                    cwd >>= 1; ++cnt;
-                  }
-
-                  sample_mask += sample_mask;
-                  if (new_sig & sample_mask)
-                  {
-                    assert(dp[2 * stride] == 0);
-                    dp[2 * stride] |= ((cwd & 1) << 31) | val;
-                    cwd >>= 1; ++cnt;
-                  }
-
-                  sample_mask += sample_mask;
-                  if (new_sig & sample_mask)
-                  {
-                    assert(dp[3 * stride] == 0);
-                    dp[3 * stride] |= ((cwd & 1) << 31) | val;
-                    cwd >>= 1; ++cnt;
-                  }
-                }
-
-              }
-              frwd_advance(&sigprop, cnt); //consume the bits from bitstrm
-              cnt = 0;
-
-              //update the next 8 columns
-              if (n == 4)
-              {
-                //horizontally
-                OPJ_UINT32 t = new_sig >> 28;
-                t |= ((t & 0xE) >> 1) | ((t & 7) << 1);
-                cur_mbr[1] |= t & ~cur_sig[1];
-              }
+                //clear current sigma
+                //mbr need not be cleared because it is overwritten
+                cur_sig = y & 0x4 ? sigma2 : sigma1;
+                memset(cur_sig, 0, ((((OPJ_UINT32)width + 7u) >> 3) + 1u) << 2);
+            }
+        }
+    }
+
+    //terminating
+    if (num_passes > 1) {
+        OPJ_INT32 st, y;
+
+        if (num_passes > 2 && ((height & 3) == 1 || (height & 3) == 2)) {
+            //do magref
+            OPJ_UINT32 *cur_sig = height & 0x4 ? sigma2 : sigma1; //reversed
+            OPJ_UINT32 *dpp = decoded_data + (height & 0xFFFFFC) * stride;
+            OPJ_UINT32 half = 1u << (p - 2);
+            OPJ_INT32 i;
+            for (i = 0; i < width; i += 8) {
+                OPJ_UINT32 cwd = rev_fetch_mrp(&magref);
+                OPJ_UINT32 sig = *cur_sig++;
+                OPJ_UINT32 col_mask = 0xF;
+                OPJ_UINT32 *dp = dpp + i;
+                if (sig) {
+                    int j;
+                    for (j = 0; j < 8; ++j, dp++) {
+                        if (sig & col_mask) {
+                            OPJ_UINT32 sample_mask = 0x11111111 & col_mask;
+
+                            if (sig & sample_mask) {
+                                OPJ_UINT32 sym;
+                                assert(dp[0] != 0);
+                                sym = cwd & 1;
+                                dp[0] ^= (1 - sym) << (p - 1);
+                                dp[0] |= half;
+                                cwd >>= 1;
+                            }
+                            sample_mask += sample_mask;
+
+                            if (sig & sample_mask) {
+                                OPJ_UINT32 sym;
+                                assert(dp[stride] != 0);
+                                sym = cwd & 1;
+                                dp[stride] ^= (1 - sym) << (p - 1);
+                                dp[stride] |= half;
+                                cwd >>= 1;
+                            }
+                            sample_mask += sample_mask;
+
+                            if (sig & sample_mask) {
+                                OPJ_UINT32 sym;
+                                assert(dp[2 * stride] != 0);
+                                sym = cwd & 1;
+                                dp[2 * stride] ^= (1 - sym) << (p - 1);
+                                dp[2 * stride] |= half;
+                                cwd >>= 1;
+                            }
+                            sample_mask += sample_mask;
+
+                            if (sig & sample_mask) {
+                                OPJ_UINT32 sym;
+                                assert(dp[3 * stride] != 0);
+                                sym = cwd & 1;
+                                dp[3 * stride] ^= (1 - sym) << (p - 1);
+                                dp[3 * stride] |= half;
+                                cwd >>= 1;
+                            }
+                            sample_mask += sample_mask;
+                        }
+                        col_mask <<= 4;
+                    }
+                }
+                rev_advance_mrp(&magref, population_count(sig));
+            }
+        }
+
+        //do the last incomplete stripe
+        // for cases of (height & 3) == 0 and 3
+        // they should have been processed previously
+        if ((height & 3) == 1 || (height & 3) == 2) {
+            //generate mbr of first stripe
+            OPJ_UINT32 *sig = height & 0x4 ? sigma2 : sigma1;
+            OPJ_UINT32 *mbr = height & 0x4 ? mbr2 : mbr1;
+            //integrate horizontally
+            OPJ_UINT32 prev = 0;
+            OPJ_INT32 i;
+            for (i = 0; i < width; i += 8, mbr++, sig++) {
+                OPJ_UINT32 t, z;
+
+                mbr[0] = sig[0];
+                mbr[0] |= prev >> 28;    //for first column, left neighbors
+                mbr[0] |= sig[0] << 4;   //left neighbors
+                mbr[0] |= sig[0] >> 4;   //left neighbors
+                mbr[0] |= sig[1] << 28;  //for last column, right neighbors
+                prev = sig[0];
+
+                //integrate vertically
+                t = mbr[0], z = mbr[0];
+                z |= (t & 0x77777777) << 1; //above neighbors
+                z |= (t & 0xEEEEEEEE) >> 1; //below neighbors
+                mbr[0] = z & ~sig[0]; //remove already significance samples
+            }
+        }
+
+        st = height;
+        st -= height > 6 ? (((height + 1) & 3) + 3) : height;
+        for (y = st; y < height; y += 4) {
+            OPJ_UINT32 *cur_sig, *cur_mbr, *nxt_sig, *nxt_mbr;
+            OPJ_UINT32 val;
+            OPJ_INT32 i;
+
+            OPJ_UINT32 pattern = 0xFFFFFFFFu; // a pattern needed samples
+            if (height - y == 3) {
+                pattern = 0x77777777u;
+            } else if (height - y == 2) {
+                pattern = 0x33333333u;
+            } else if (height - y == 1) {
+                pattern = 0x11111111u;
+            }
+
+            //add membership from the next stripe, obtained above
+            if (height - y > 4) {
+                OPJ_UINT32 prev = 0;
+                OPJ_INT32 i;
+                cur_sig = y & 0x4 ? sigma2 : sigma1;
+                cur_mbr = y & 0x4 ? mbr2 : mbr1;
+                nxt_sig = y & 0x4 ? sigma1 : sigma2;
+                for (i = 0; i < width; i += 8, cur_mbr++, cur_sig++, nxt_sig++) {
+                    OPJ_UINT32 t = nxt_sig[0];
+                    t |= prev >> 28;     //for first column, left neighbors
+                    t |= nxt_sig[0] << 4;   //left neighbors
+                    t |= nxt_sig[0] >> 4;   //right neighbors
+                    t |= nxt_sig[1] << 28;  //for last column, right neighbors
+                    prev = nxt_sig[0];
+
+                    cur_mbr[0] |= (t & 0x11111111) << 3;
+                    //remove already significant samples
+                    cur_mbr[0] &= ~cur_sig[0];
+                }
+            }
+
+            //find new locations and get signs
+            cur_sig = y & 0x4 ? sigma2 : sigma1;
+            cur_mbr = y & 0x4 ? mbr2 : mbr1;
+            nxt_sig = y & 0x4 ? sigma1 : sigma2;
+            nxt_mbr = y & 0x4 ? mbr1 : mbr2;
+            val = 3u << (p - 2);
+            for (i = 0; i < width; i += 8,
+                    cur_sig++, cur_mbr++, nxt_sig++, nxt_mbr++) {
+                OPJ_UINT32 mbr = *cur_mbr & pattern; //skip unneeded samples
+                OPJ_UINT32 new_sig = 0;
+                OPJ_UINT32 ux, tx;
+                if (mbr) {
+                    OPJ_INT32 n;
+                    for (n = 0; n < 8; n += 4) {
+                        OPJ_UINT32 col_mask;
+                        OPJ_UINT32 inv_sig;
+                        OPJ_INT32 end;
+                        OPJ_INT32 j;
+
+                        OPJ_UINT32 cwd = frwd_fetch(&sigprop);
+                        OPJ_UINT32 cnt = 0;
+
+                        OPJ_UINT32 *dp = decoded_data + y * stride;
+                        dp += i + n;
+
+                        col_mask = 0xFu << (4 * n);
+
+                        inv_sig = ~cur_sig[0] & pattern;
+
+                        end = n + 4 + i < width ? n + 4 : width - i;
+                        for (j = n; j < end; ++j, ++dp, col_mask <<= 4) {
+                            OPJ_UINT32 sample_mask;
+
+                            if ((col_mask & mbr) == 0) {
+                                continue;
+                            }
+
+                            //scan 4 mbr
+                            sample_mask = 0x11111111u & col_mask;
+                            if (mbr & sample_mask) {
+                                assert(dp[0] == 0);
+                                if (cwd & 1) {
+                                    OPJ_UINT32 t;
+                                    new_sig |= sample_mask;
+                                    t = 0x32u << (j * 4);
+                                    mbr |= t & inv_sig;
+                                }
+                                cwd >>= 1;
+                                ++cnt;
+                            }
+
+                            sample_mask += sample_mask;
+                            if (mbr & sample_mask) {
+                                assert(dp[stride] == 0);
+                                if (cwd & 1) {
+                                    OPJ_UINT32 t;
+                                    new_sig |= sample_mask;
+                                    t = 0x74u << (j * 4);
+                                    mbr |= t & inv_sig;
+                                }
+                                cwd >>= 1;
+                                ++cnt;
+                            }
+
+                            sample_mask += sample_mask;
+                            if (mbr & sample_mask) {
+                                assert(dp[2 * stride] == 0);
+                                if (cwd & 1) {
+                                    OPJ_UINT32 t;
+                                    new_sig |= sample_mask;
+                                    t = 0xE8u << (j * 4);
+                                    mbr |= t & inv_sig;
+                                }
+                                cwd >>= 1;
+                                ++cnt;
+                            }
+
+                            sample_mask += sample_mask;
+                            if (mbr & sample_mask) {
+                                assert(dp[3 * stride] == 0);
+                                if (cwd & 1) {
+                                    OPJ_UINT32 t;
+                                    new_sig |= sample_mask;
+                                    t = 0xC0u << (j * 4);
+                                    mbr |= t & inv_sig;
+                                }
+                                cwd >>= 1;
+                                ++cnt;
+                            }
+                        }
+
+                        //signs here
+                        if (new_sig & (0xFFFFu << (4 * n))) {
+                            OPJ_UINT32 col_mask;
+                            OPJ_INT32 j;
+                            OPJ_UINT32 *dp = decoded_data + y * stride;
+                            dp += i + n;
+                            col_mask = 0xFu << (4 * n);
+
+                            for (j = n; j < end; ++j, ++dp, col_mask <<= 4) {
+                                OPJ_UINT32 sample_mask;
+                                if ((col_mask & new_sig) == 0) {
+                                    continue;
+                                }
+
+                                //scan 4 signs
+                                sample_mask = 0x11111111u & col_mask;
+                                if (new_sig & sample_mask) {
+                                    assert(dp[0] == 0);
+                                    dp[0] |= ((cwd & 1) << 31) | val;
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+
+                                sample_mask += sample_mask;
+                                if (new_sig & sample_mask) {
+                                    assert(dp[stride] == 0);
+                                    dp[stride] |= ((cwd & 1) << 31) | val;
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+
+                                sample_mask += sample_mask;
+                                if (new_sig & sample_mask) {
+                                    assert(dp[2 * stride] == 0);
+                                    dp[2 * stride] |= ((cwd & 1) << 31) | val;
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+
+                                sample_mask += sample_mask;
+                                if (new_sig & sample_mask) {
+                                    assert(dp[3 * stride] == 0);
+                                    dp[3 * stride] |= ((cwd & 1) << 31) | val;
+                                    cwd >>= 1;
+                                    ++cnt;
+                                }
+                            }
+
+                        }
+                        frwd_advance(&sigprop, cnt);
+                        cnt = 0;
+
+                        //update next columns
+                        if (n == 4) {
+                            //horizontally
+                            OPJ_UINT32 t = new_sig >> 28;
+                            t |= ((t & 0xE) >> 1) | ((t & 7) << 1);
+                            cur_mbr[1] |= t & ~cur_sig[1];
+                        }
+                    }
+                }
+                //propagate down (vertical propagation)
+                new_sig |= cur_sig[0];
+                ux = (new_sig & 0x88888888) >> 3;
+                tx = ux | (ux << 4) | (ux >> 4);
+                if (i > 0) {
+                    nxt_mbr[-1] |= (ux << 28) & ~nxt_sig[-1];
+                }
+                nxt_mbr[0] |= tx & ~nxt_sig[0];
+                nxt_mbr[1] |= (ux >> 28) & ~nxt_sig[1];
             }
-          }
-          //update the next stripe (vertically propagation)
-          new_sig |= cur_sig[0];
-          ux = (new_sig & 0x88888888) >> 3;
-          tx = ux | (ux << 4) | (ux >> 4); //left and right neighbors
-          if (i > 0)
-            nxt_mbr[-1] |= (ux << 28) & ~nxt_sig[-1];
-          nxt_mbr[0] |= tx & ~nxt_sig[0];
-          nxt_mbr[1] |= (ux >> 28) & ~nxt_sig[1];
         }
-
-        //clear current sigma
-        //mbr need not be cleared because it is overwritten
-        cur_sig = y & 0x4 ? sigma2 : sigma1;
-        memset(cur_sig, 0, ((((OPJ_UINT32)width + 7u) >> 3) + 1u) << 2);
-      }
-    }
-  }
-
-  //terminating
-  if (num_passes > 1) {
-    OPJ_INT32 st;
-
-    if (num_passes > 2 && ((height & 3) == 1 || (height & 3) == 2))
-    {//do magref
-      OPJ_UINT32 *cur_sig = height & 0x4 ? sigma2 : sigma1; //reversed
-      OPJ_UINT32 *dpp = decoded_data + (height & 0xFFFFFC) * stride;
-      OPJ_UINT32 half = 1u << (p - 2);
-      for (OPJ_INT32 i = 0; i < width; i += 8)
-      {
-        OPJ_UINT32 cwd = rev_fetch_mrp(&magref);
-        OPJ_UINT32 sig = *cur_sig++;
-        OPJ_UINT32 col_mask = 0xF;
-        OPJ_UINT32 *dp = dpp + i;
-        if (sig)
-        {
-          for (int j = 0; j < 8; ++j, dp++)
-          {
-            if (sig & col_mask)
-            {
-              OPJ_UINT32 sample_mask = 0x11111111 & col_mask;
-
-              if (sig & sample_mask)
-              {
-                OPJ_UINT32 sym;
-                assert(dp[0] != 0);
-                sym = cwd & 1;
-                dp[0] ^= (1 - sym) << (p - 1);
-                dp[0] |= half;
-                cwd >>= 1;
-              }
-              sample_mask += sample_mask;
-
-              if (sig & sample_mask)
-              {
-                OPJ_UINT32 sym;
-                assert(dp[stride] != 0);
-                sym = cwd & 1;
-                dp[stride] ^= (1 - sym) << (p - 1);
-                dp[stride] |= half;
-                cwd >>= 1;
-              }
-              sample_mask += sample_mask;
-
-              if (sig & sample_mask)
-              {
-                OPJ_UINT32 sym;
-                assert(dp[2 * stride] != 0);
-                sym = cwd & 1;
-                dp[2 * stride] ^= (1 - sym) << (p - 1);
-                dp[2 * stride] |= half;
-                cwd >>= 1;
-              }
-              sample_mask += sample_mask;
-
-              if (sig & sample_mask)
-              {
-                OPJ_UINT32 sym;
-                assert(dp[3 * stride] != 0);
-                sym = cwd & 1;
-                dp[3 * stride] ^= (1 - sym) << (p - 1);
-                dp[3 * stride] |= half;
-                cwd >>= 1;
-              }
-              sample_mask += sample_mask;
-            }
-            col_mask <<= 4;
-          }
-        }
-        rev_advance_mrp(&magref, population_count(sig));
-      }
     }
 
-    //do the last incomplete stripe
-    // for cases of (height & 3) == 0 and 3
-    // the should have been processed previously
-    if ((height & 3) == 1 || (height & 3) == 2)
     {
-      //generate mbr of first stripe
-      OPJ_UINT32 *sig = height & 0x4 ? sigma2 : sigma1;
-      OPJ_UINT32 *mbr = height & 0x4 ? mbr2 : mbr1;
-      //integrate horizontally
-      OPJ_UINT32 prev = 0;
-      for (OPJ_INT32 i = 0; i < width; i += 8, mbr++, sig++)
-      {
-        OPJ_UINT32 t, z;
-
-        mbr[0] = sig[0];
-        mbr[0] |= prev >> 28;    //for first column, left neighbors
-        mbr[0] |= sig[0] << 4;   //left neighbors
-        mbr[0] |= sig[0] >> 4;   //left neighbors
-        mbr[0] |= sig[1] << 28;  //for last column, right neighbors
-        prev = sig[0];
-
-        //integrate vertically
-        t = mbr[0], z = mbr[0];
-        z |= (t & 0x77777777) << 1; //above neighbors
-        z |= (t & 0xEEEEEEEE) >> 1; //below neighbors
-        mbr[0] = z & ~sig[0]; //remove already significance samples
-      }
-    }
-
-    st = height;
-    st -= height > 6 ? (((height + 1) & 3) + 3) : height;
-    for (OPJ_INT32 y = st; y < height; y += 4)
-    {
-      OPJ_UINT32 *cur_sig, *cur_mbr, *nxt_sig, *nxt_mbr;
-      OPJ_UINT32 val;
-
-      OPJ_UINT32 pattern = 0xFFFFFFFFu; // a pattern needed samples
-      if (height - y == 3)
-        pattern = 0x77777777u;
-      else if (height - y == 2)
-        pattern = 0x33333333u;
-      else if (height - y == 1)
-        pattern = 0x11111111u;
-
-      //add membership from the next stripe, obtained above
-      if (height - y > 4)
-      {
-        OPJ_UINT32 prev = 0;
-        cur_sig = y & 0x4 ? sigma2 : sigma1;
-        cur_mbr = y & 0x4 ? mbr2 : mbr1;
-        nxt_sig = y & 0x4 ? sigma1 : sigma2;
-
-        for (OPJ_INT32 i=0; i<width; i += 8, cur_mbr++, cur_sig++, nxt_sig++)
-        {
-          OPJ_UINT32 t = nxt_sig[0];
-          t |= prev >> 28;     //for first column, left neighbors
-          t |= nxt_sig[0] << 4;   //left neighbors
-          t |= nxt_sig[0] >> 4;   //left neighbors
-          t |= nxt_sig[1] << 28;  //for last column, right neighbors
-          prev = nxt_sig[0];
-
-          cur_mbr[0] |= (t & 0x11111111) << 3;
-          //remove already significance samples
-          cur_mbr[0] &= ~cur_sig[0];
+        OPJ_INT32 x, y;
+        for (y = 0; y < height; ++y) {
+            OPJ_INT32* sp = (OPJ_INT32*)decoded_data + y * stride;
+            for (x = 0; x < width; ++x, ++sp) {
+                OPJ_INT32 val = (*sp & 0x7FFFFFFF);
+                *sp = ((OPJ_UINT32) * sp & 0x80000000) ? -val : val;
+            }
         }
-      }
-
-      //find new locations and get signs
-      cur_sig = y & 0x4 ? sigma2 : sigma1;
-      cur_mbr = y & 0x4 ? mbr2 : mbr1;
-      nxt_sig = y & 0x4 ? sigma1 : sigma2;
-      nxt_mbr = y & 0x4 ? mbr1 : mbr2;
-      val = 3u << (p - 2);
-      for (OPJ_INT32 i = 0; i < width; i += 8,
-            cur_sig++, cur_mbr++, nxt_sig++, nxt_mbr++)
-      {
-        OPJ_UINT32 mbr = *cur_mbr & pattern; //skip unneeded samples
-        OPJ_UINT32 new_sig = 0;
-        OPJ_UINT32 ux, tx;
-        if (mbr)
-        {
-          for (OPJ_INT32 n = 0; n < 8; n += 4)
-          {
-            OPJ_UINT32 col_mask;
-            OPJ_UINT32 inv_sig;
-            OPJ_INT32 end;
-
-            OPJ_UINT32 cwd = frwd_fetch(&sigprop);
-            OPJ_UINT32 cnt = 0;
-
-            OPJ_UINT32 *dp = decoded_data + y * stride;
-            dp += i + n;
-
-            col_mask = 0xFu << (4 * n);
-
-            inv_sig = ~cur_sig[0] & pattern;
-
-            end = n + 4 + i < width ? n + 4 : width - i;
-            for (OPJ_INT32 j = n; j < end; ++j, ++dp, col_mask <<= 4)
-            {
-              OPJ_UINT32 sample_mask;
-
-              if ((col_mask & mbr) == 0)
-                continue;
-
-              //scan 4 mbr
-              sample_mask = 0x11111111u & col_mask;
-              if (mbr & sample_mask)
-              {
-                assert(dp[0] == 0);
-                if (cwd & 1)
-                {
-                  OPJ_UINT32 t;
-                  new_sig |= sample_mask;
-                  t = 0x32u << (j * 4);
-                  mbr |= t & inv_sig;
-                }
-                cwd >>= 1; ++cnt;
-              }
-
-              sample_mask += sample_mask;
-              if (mbr & sample_mask)
-              {
-                assert(dp[stride] == 0);
-                if (cwd & 1)
-                {
-                  OPJ_UINT32 t;
-                  new_sig |= sample_mask;
-                  t = 0x74u << (j * 4);
-                  mbr |= t & inv_sig;
-                }
-                cwd >>= 1; ++cnt;
-              }
-
-              sample_mask += sample_mask;
-              if (mbr & sample_mask)
-              {
-                assert(dp[2 * stride] == 0);
-                if (cwd & 1)
-                {
-                  OPJ_UINT32 t;
-                  new_sig |= sample_mask;
-                  t = 0xE8u << (j * 4);
-                  mbr |= t & inv_sig;
-                }
-                cwd >>= 1; ++cnt;
-              }
-
-              sample_mask += sample_mask;
-              if (mbr & sample_mask)
-              {
-                assert(dp[3 * stride] == 0);
-                if (cwd & 1)
-                {
-                  OPJ_UINT32 t;
-                  new_sig |= sample_mask;
-                  t = 0xC0u << (j * 4);
-                  mbr |= t & inv_sig;
-                }
-                cwd >>= 1; ++cnt;
-              }
-            }
-
-            //signs here
-            if (new_sig & (0xFFFFu << (4 * n)))
-            {
-              OPJ_UINT32 col_mask;
-              OPJ_UINT32 *dp = decoded_data + y * stride;
-              dp += i + n;
-              col_mask = 0xFu << (4 * n);
-
-              for (OPJ_INT32 j = n; j < end; ++j, ++dp, col_mask <<= 4)
-              {
-                OPJ_UINT32 sample_mask;
-                if ((col_mask & new_sig) == 0)
-                  continue;
-
-                //scan 4 signs
-                sample_mask = 0x11111111u & col_mask;
-                if (new_sig & sample_mask)
-                {
-                  assert(dp[0] == 0);
-                  dp[0] |= ((cwd & 1) << 31) | val;
-                  cwd >>= 1; ++cnt;
-                }
-
-                sample_mask += sample_mask;
-                if (new_sig & sample_mask)
-                {
-                  assert(dp[stride] == 0);
-                  dp[stride] |= ((cwd & 1) << 31) | val;
-                  cwd >>= 1; ++cnt;
-                }
-
-                sample_mask += sample_mask;
-                if (new_sig & sample_mask)
-                {
-                  assert(dp[2 * stride] == 0);
-                  dp[2 * stride] |= ((cwd & 1) << 31) | val;
-                  cwd >>= 1; ++cnt;
-                }
-
-                sample_mask += sample_mask;
-                if (new_sig & sample_mask)
-                {
-                  assert(dp[3 * stride] == 0);
-                  dp[3 * stride] |= ((cwd & 1) << 31) | val;
-                  cwd >>= 1; ++cnt;
-                }
-              }
-
-            }
-            frwd_advance(&sigprop, cnt);
-            cnt = 0;
-
-            //update next columns
-            if (n == 4)
-            {
-              //horizontally
-              OPJ_UINT32 t = new_sig >> 28;
-              t |= ((t & 0xE) >> 1) | ((t & 7) << 1);
-              cur_mbr[1] |= t & ~cur_sig[1];
-            }
-          }
-        }
-        //propagate down (vertically propagation)
-        new_sig |= cur_sig[0];
-        ux = (new_sig & 0x88888888) >> 3;
-        tx = ux | (ux << 4) | (ux >> 4);
-        if (i > 0)
-          nxt_mbr[-1] |= (ux << 28) & ~nxt_sig[-1];
-        nxt_mbr[0] |= tx & ~nxt_sig[0];
-        nxt_mbr[1] |= (ux >> 28) & ~nxt_sig[1];
-      }
     }
-  }
 
-  //int shift = 29 - missing_msbs;
-  for (OPJ_INT32 y = 0; y < height; ++y)
-  {
-    OPJ_INT32* sp = (OPJ_INT32*)decoded_data + y * stride;
-    for (OPJ_INT32 x = 0; x < width; ++x, ++sp)
-    {
-      OPJ_INT32 val = (*sp & 0x7FFFFFFF);
-      *sp = ((OPJ_UINT32)*sp & 0x80000000) ? -val : val;
-    }
-  }
-
-  return OPJ_TRUE;
+    return OPJ_TRUE;
 }
diff --git a/src/lib/openjp2/j2k.c b/src/lib/openjp2/j2k.c
index 8bbf0be8..6586c701 100644
--- a/src/lib/openjp2/j2k.c
+++ b/src/lib/openjp2/j2k.c
@@ -10617,12 +10617,18 @@ static OPJ_BOOL opj_j2k_read_SPCod_SPCoc(opj_j2k_t *p_j2k,
     /* SPcod (G) / SPcoc (D) */
     opj_read_bytes(l_current_ptr, &l_tccp->cblksty, 1);
     ++l_current_ptr;
-    if ((l_tccp->cblksty & 0x80U) != 0 || (l_tccp->cblksty & 0x48U) == 0x48U) { 
-    /* For HT, we only support one mode, bit 6 set, meaning that "all code-blocks 
-       within the corresponding tile-component shall be HT code-blocks, and 
-       bit 3 is reset, meaning that "No vertically causal context". */
+    if ((l_tccp->cblksty & J2K_CCP_CBLKSTY_HTMIXED) != 0) {
+        /* We do not support HT mixed mode yet */
         opj_event_msg(p_manager, EVT_ERROR,
-                      "Error reading SPCod SPCoc element, Invalid code-block style found\n");
+                      "Error reading SPCod SPCoc element. Unsupported Mixed HT code-block style found\n");
+        return OPJ_FALSE;
+    }
+
+    if ((l_tccp->cblksty & (J2K_CCP_CBLKSTY_HT | J2K_CCP_CBLKSTY_VSC)) ==
+            (J2K_CCP_CBLKSTY_HT | J2K_CCP_CBLKSTY_VSC)) {
+        /* For HT, we do not support vertically causal mode yet. */
+        opj_event_msg(p_manager, EVT_ERROR,
+                      "Error reading SPCod SPCoc element. Unsupported HT mode with vertically causal mode. \n");
         return OPJ_FALSE;
     }
 
diff --git a/src/lib/openjp2/j2k.h b/src/lib/openjp2/j2k.h
index ac69a376..51e7c23e 100644
--- a/src/lib/openjp2/j2k.h
+++ b/src/lib/openjp2/j2k.h
@@ -61,7 +61,8 @@ The functions in J2K.C have for goal to read/write the several parts of the code
 #define J2K_CCP_CBLKSTY_VSC 0x08      /**< Vertically stripe causal context */
 #define J2K_CCP_CBLKSTY_PTERM 0x10    /**< Predictable termination */
 #define J2K_CCP_CBLKSTY_SEGSYM 0x20   /**< Segmentation symbols are used */
-#define J2K_CCP_CBLKSTY_HT 0x40       /**< (high throughput) HT codeblock */
+#define J2K_CCP_CBLKSTY_HT 0x40       /**< (high throughput) HT codeblocks */
+#define J2K_CCP_CBLKSTY_HTMIXED 0x80  /**< MIXED mode HT codeblocks */
 #define J2K_CCP_QNTSTY_NOQNT 0
 #define J2K_CCP_QNTSTY_SIQNT 1
 #define J2K_CCP_QNTSTY_SEQNT 2
diff --git a/src/lib/openjp2/t1.c b/src/lib/openjp2/t1.c
index bb97c7ea..f5fd2339 100644
--- a/src/lib/openjp2/t1.c
+++ b/src/lib/openjp2/t1.c
@@ -1700,8 +1700,7 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
             opj_free(job);
             return;
         }
-    }
-    else {
+    } else {
         if (OPJ_FALSE == opj_t1_decode_cblk(
                     t1,
                     cblk,
diff --git a/src/lib/openjp2/t2.c b/src/lib/openjp2/t2.c
index 4626d69b..48f8949b 100644
--- a/src/lib/openjp2/t2.c
+++ b/src/lib/openjp2/t2.c
@@ -1261,61 +1261,61 @@ static OPJ_BOOL opj_t2_read_packet_header(opj_t2_t* p_t2,
 
             if ((p_tcp->tccps[p_pi->compno].cblksty & J2K_CCP_CBLKSTY_HT) != 0)
                 do {
-                  OPJ_UINT32 bit_number;
-                  l_cblk->segs[l_segno].numnewpasses = l_segno == 0 ? 1u : (OPJ_UINT32)n;
-                  bit_number = l_cblk->numlenbits + opj_uint_floorlog2(
-                                   l_cblk->segs[l_segno].numnewpasses);
-                  if (bit_number > 32) {
-                      opj_event_msg(p_manager, EVT_ERROR,
-                                    "Invalid bit number %d in opj_t2_read_packet_header()\n",
-                                    bit_number);
-                      opj_bio_destroy(l_bio);
-                      return OPJ_FALSE;
-                  }
-                  l_cblk->segs[l_segno].newlen = opj_bio_read(l_bio, bit_number);
-                  JAS_FPRINTF(stderr, "included=%d numnewpasses=%d increment=%d len=%d \n",
-                              l_included, l_cblk->segs[l_segno].numnewpasses, l_increment,
-                              l_cblk->segs[l_segno].newlen);
+                    OPJ_UINT32 bit_number;
+                    l_cblk->segs[l_segno].numnewpasses = l_segno == 0 ? 1 : (OPJ_UINT32)n;
+                    bit_number = l_cblk->numlenbits + opj_uint_floorlog2(
+                                     l_cblk->segs[l_segno].numnewpasses);
+                    if (bit_number > 32) {
+                        opj_event_msg(p_manager, EVT_ERROR,
+                                      "Invalid bit number %d in opj_t2_read_packet_header()\n",
+                                      bit_number);
+                        opj_bio_destroy(l_bio);
+                        return OPJ_FALSE;
+                    }
+                    l_cblk->segs[l_segno].newlen = opj_bio_read(l_bio, bit_number);
+                    JAS_FPRINTF(stderr, "included=%d numnewpasses=%d increment=%d len=%d \n",
+                                l_included, l_cblk->segs[l_segno].numnewpasses, l_increment,
+                                l_cblk->segs[l_segno].newlen);
 
-                  n -= (OPJ_INT32)l_cblk->segs[l_segno].numnewpasses;
-                  if (n > 0) {
-                      ++l_segno;
+                    n -= (OPJ_INT32)l_cblk->segs[l_segno].numnewpasses;
+                    if (n > 0) {
+                        ++l_segno;
 
-                      if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) {
-                          opj_bio_destroy(l_bio);
-                          return OPJ_FALSE;
-                      }
-                  }
+                        if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) {
+                            opj_bio_destroy(l_bio);
+                            return OPJ_FALSE;
+                        }
+                    }
                 } while (n > 0);
-            else 
+            else
                 do {
-                  OPJ_UINT32 bit_number;
-                  l_cblk->segs[l_segno].numnewpasses = (OPJ_UINT32)opj_int_min((OPJ_INT32)(
-                          l_cblk->segs[l_segno].maxpasses - l_cblk->segs[l_segno].numpasses), n);
-                  bit_number = l_cblk->numlenbits + opj_uint_floorlog2(
-                                   l_cblk->segs[l_segno].numnewpasses);
-                  if (bit_number > 32) {
-                      opj_event_msg(p_manager, EVT_ERROR,
-                                    "Invalid bit number %d in opj_t2_read_packet_header()\n",
-                                    bit_number);
-                      opj_bio_destroy(l_bio);
-                      return OPJ_FALSE;
-                  }
-                  l_cblk->segs[l_segno].newlen = opj_bio_read(l_bio, bit_number);
-                  JAS_FPRINTF(stderr, "included=%d numnewpasses=%d increment=%d len=%d \n",
-                              l_included, l_cblk->segs[l_segno].numnewpasses, l_increment,
-                              l_cblk->segs[l_segno].newlen);
+                    OPJ_UINT32 bit_number;
+                    l_cblk->segs[l_segno].numnewpasses = (OPJ_UINT32)opj_int_min((OPJ_INT32)(
+                            l_cblk->segs[l_segno].maxpasses - l_cblk->segs[l_segno].numpasses), n);
+                    bit_number = l_cblk->numlenbits + opj_uint_floorlog2(
+                                     l_cblk->segs[l_segno].numnewpasses);
+                    if (bit_number > 32) {
+                        opj_event_msg(p_manager, EVT_ERROR,
+                                      "Invalid bit number %d in opj_t2_read_packet_header()\n",
+                                      bit_number);
+                        opj_bio_destroy(l_bio);
+                        return OPJ_FALSE;
+                    }
+                    l_cblk->segs[l_segno].newlen = opj_bio_read(l_bio, bit_number);
+                    JAS_FPRINTF(stderr, "included=%d numnewpasses=%d increment=%d len=%d \n",
+                                l_included, l_cblk->segs[l_segno].numnewpasses, l_increment,
+                                l_cblk->segs[l_segno].newlen);
 
-                  n -= (OPJ_INT32)l_cblk->segs[l_segno].numnewpasses;
-                  if (n > 0) {
-                      ++l_segno;
+                    n -= (OPJ_INT32)l_cblk->segs[l_segno].numnewpasses;
+                    if (n > 0) {
+                        ++l_segno;
 
-                      if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) {
-                          opj_bio_destroy(l_bio);
-                          return OPJ_FALSE;
-                      }
-                  }
-              } while (n > 0);
+                        if (! opj_t2_init_seg(l_cblk, l_segno, p_tcp->tccps[p_pi->compno].cblksty, 0)) {
+                            opj_bio_destroy(l_bio);
+                            return OPJ_FALSE;
+                        }
+                    }
+                } while (n > 0);
 
             ++l_cblk;
         }
diff --git a/src/lib/openjp2/tcd.h b/src/lib/openjp2/tcd.h
index a89279d0..340c2bf8 100644
--- a/src/lib/openjp2/tcd.h
+++ b/src/lib/openjp2/tcd.h
@@ -122,12 +122,12 @@ typedef struct opj_tcd_cblk_dec {
     opj_tcd_seg_data_chunk_t* chunks; /* Array of chunks */
     /* position of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */
     OPJ_INT32 x0, y0, x1, y1;
-    /* Mb is The maximum number of bit-planes available for the representation of 
-       coefficients in any sub-band, b, as defined in Equation (E-2). See 
+    /* Mb is The maximum number of bit-planes available for the representation of
+       coefficients in any sub-band, b, as defined in Equation (E-2). See
        Section B.10.5 of the standard */
     OPJ_UINT32 Mb;  /* currently used only to check if HT decoding is correct */
     /* numbps is Mb - P as defined in Section B.10.5 of the standard */
-    OPJ_UINT32 numbps;  
+    OPJ_UINT32 numbps;
     /* number of bits for len, for the current packet. Transitory value */
     OPJ_UINT32 numlenbits;
     /* number of pass added to the code-blocks, for the current packet. Transitory value */