Coverage Report

Created: 2024-05-21 06:21

/src/grok/src/lib/core/wavelet/WaveletReverse.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *    Copyright (C) 2016-2024 Grok Image Compression Inc.
3
 *
4
 *    This source code is free software: you can redistribute it and/or  modify
5
 *    it under the terms of the GNU Affero General Public License, version 3,
6
 *    as published by the Free Software Foundation.
7
 *
8
 *    This source code is distributed in the hope that it will be useful,
9
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
 *    GNU Affero General Public License for more details.
12
 *
13
 *    You should have received a copy of the GNU Affero General Public License
14
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
 *
16
 *
17
 *    This source code incorporates work covered by the BSD 2-clause license.
18
 *    Please see the LICENSE file in the root directory for details.
19
 *
20
 */
21
22
#include "grk_includes.h"
23
#include <algorithm>
24
#include <limits>
25
#include <sstream>
26
27
#undef HWY_TARGET_INCLUDE
28
#define HWY_TARGET_INCLUDE "wavelet/WaveletReverse.cpp"
29
#include <hwy/foreach_target.h>
30
#include <hwy/highway.h>
31
HWY_BEFORE_NAMESPACE();
32
namespace grk
33
{
34
namespace HWY_NAMESPACE
35
{
36
   using namespace hwy::HWY_NAMESPACE;
37
38
   static size_t hwy_num_lanes(void)
39
0
   {
40
0
    const HWY_FULL(int32_t) di;
41
0
    return Lanes(di);
42
0
   }
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_num_lanes()
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_num_lanes()
43
44
0
/* Number of int32_t columns handled per call by the 5/3 vertical kernels below:
 * two full vectors. The expansion refers to a Highway tag named `di`, so a
 * variable with that name must be in scope wherever the macro is used. */
#define HWY_PLL_COLS_53 (2 * Lanes(di))
45
46
   static void hwy_decompress_v_final_memcpy_53(const int32_t* buf, const uint32_t height,
47
                        int32_t* dest, const size_t strideDest)
48
0
   {
49
0
    const HWY_FULL(int32_t) di;
50
0
    for(uint32_t i = 0; i < height; ++i)
51
0
    {
52
0
     StoreU(Load(di, buf + HWY_PLL_COLS_53 * i + 0), di, &dest[(size_t)i * strideDest + 0]);
53
0
     StoreU(Load(di, buf + HWY_PLL_COLS_53 * i + Lanes(di)), di,
54
0
        dest + (size_t)i * strideDest + Lanes(di));
55
0
    }
56
0
   }
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long)
57
   /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
58
  * 16 in AVX2, when top-most pixel is on even coordinate */
59
   static void hwy_decompress_v_parity_even_mcols_53(int32_t* buf, int32_t* bandL, /* even */
60
                           const uint32_t hL, const size_t strideL,
61
                           int32_t* bandH, /* odd */
62
                           const uint32_t hH, const size_t strideH,
63
                           int32_t* dest, const uint32_t strideDest)
64
0
   {
65
0
    const HWY_FULL(int32_t) di;
66
0
    auto two = Set(di, 2);
67
68
0
    const uint32_t total_height = hL + hH;
69
0
    assert(total_height > 1);
70
71
    /* Note: loads of input even/odd values must be done in a unaligned */
72
    /* fashion. But stores in tmp can be done with aligned store, since */
73
    /* the temporary buffer is properly aligned */
74
0
    assert((size_t)buf % (sizeof(int32_t) * Lanes(di)) == 0);
75
76
0
    auto s1n_0 = LoadU(di, bandL + 0);
77
0
    auto s1n_1 = LoadU(di, bandL + Lanes(di));
78
0
    auto d1n_0 = LoadU(di, bandH);
79
0
    auto d1n_1 = LoadU(di, bandH + Lanes(di));
80
81
    /* s0n = s1n - ((d1n + 1) >> 1); <==> */
82
    /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
83
0
    auto s0n_0 = s1n_0 - ShiftRight<2>(d1n_0 + d1n_0 + two);
84
0
    auto s0n_1 = s1n_1 - ShiftRight<2>(d1n_1 + d1n_1 + two);
85
86
0
    uint32_t i = 0;
87
0
    if(total_height > 3)
88
0
    {
89
0
     uint32_t j;
90
0
     for(i = 0, j = 1; i < (total_height - 3); i += 2, j++)
91
0
     {
92
0
      auto d1c_0 = d1n_0;
93
0
      auto s0c_0 = s0n_0;
94
0
      auto d1c_1 = d1n_1;
95
0
      auto s0c_1 = s0n_1;
96
97
0
      s1n_0 = LoadU(di, bandL + j * strideL);
98
0
      s1n_1 = LoadU(di, bandL + j * strideL + Lanes(di));
99
0
      d1n_0 = LoadU(di, bandH + j * strideH);
100
0
      d1n_1 = LoadU(di, bandH + j * strideH + Lanes(di));
101
102
      /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
103
0
      s0n_0 = s1n_0 - ShiftRight<2>(d1c_0 + d1n_0 + two);
104
0
      s0n_1 = s1n_1 - ShiftRight<2>(d1c_1 + d1n_1 + two);
105
106
0
      Store(s0c_0, di, buf + HWY_PLL_COLS_53 * (i + 0));
107
0
      Store(s0c_1, di, buf + HWY_PLL_COLS_53 * (i + 0) + Lanes(di));
108
109
      /* d1c + ((s0c + s0n) >> 1) */
110
0
      Store(d1c_0 + ShiftRight<1>(s0c_0 + s0n_0), di, buf + HWY_PLL_COLS_53 * (i + 1) + 0);
111
0
      Store(d1c_1 + ShiftRight<1>(s0c_1 + s0n_1), di,
112
0
          buf + HWY_PLL_COLS_53 * (i + 1) + Lanes(di));
113
0
     }
114
0
    }
115
0
    Store(s0n_0, di, buf + HWY_PLL_COLS_53 * (i + 0) + 0);
116
0
    Store(s0n_1, di, buf + HWY_PLL_COLS_53 * (i + 0) + Lanes(di));
117
118
0
    if(total_height & 1)
119
0
    {
120
0
     s1n_0 = LoadU(di, bandL + (size_t)((total_height - 1) / 2) * strideL);
121
     /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
122
0
     auto tmp_len_minus_1 = s1n_0 - ShiftRight<2>(d1n_0 + d1n_0 + two);
123
0
     Store(tmp_len_minus_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1));
124
     /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
125
0
     Store(d1n_0 + ShiftRight<1>(s0n_0 + tmp_len_minus_1), di,
126
0
         buf + HWY_PLL_COLS_53 * (total_height - 2));
127
128
0
     s1n_1 = LoadU(di, bandL + (size_t)((total_height - 1) / 2) * strideL + Lanes(di));
129
     /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
130
0
     tmp_len_minus_1 = s1n_1 - ShiftRight<2>(d1n_1 + d1n_1 + two);
131
0
     Store(tmp_len_minus_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di));
132
     /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
133
0
     Store(d1n_1 + ShiftRight<1>(s0n_1 + tmp_len_minus_1), di,
134
0
         buf + HWY_PLL_COLS_53 * (total_height - 2) + Lanes(di));
135
0
    }
136
0
    else
137
0
    {
138
0
     Store(d1n_0 + s0n_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0);
139
0
     Store(d1n_1 + s0n_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di));
140
0
    }
141
0
    hwy_decompress_v_final_memcpy_53(buf, total_height, dest, strideDest);
142
0
   }
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int)
143
144
   /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
145
  * 16 in AVX2, when top-most pixel is on odd coordinate */
146
   static void hwy_decompress_v_parity_odd_mcols_53(int32_t* buf, int32_t* bandL, const uint32_t hL,
147
                          const uint32_t strideL, int32_t* bandH,
148
                          const uint32_t hH, const uint32_t strideH,
149
                          int32_t* dest, const uint32_t strideDest)
150
0
   {
151
0
    const HWY_FULL(int32_t) di;
152
0
    auto two = Set(di, 2);
153
154
0
    const uint32_t total_height = hL + hH;
155
0
    assert(total_height > 2);
156
    /* Note: loads of input even/odd values must be done in a unaligned */
157
    /* fashion. But stores in buf can be done with aligned store, since */
158
    /* the temporary buffer is properly aligned */
159
0
    assert((size_t)buf % (sizeof(int32_t) * Lanes(di)) == 0);
160
161
0
    const int32_t* in_even = bandH;
162
0
    const int32_t* in_odd = bandL;
163
0
    auto s1_0 = LoadU(di, in_even + strideH);
164
165
    /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
166
0
    auto dc_0 = LoadU(di, in_odd + 0) - ShiftRight<2>(LoadU(di, in_even + 0) + s1_0 + two);
167
0
    Store(LoadU(di, in_even + 0) + dc_0, di, buf + HWY_PLL_COLS_53 * 0);
168
0
    auto s1_1 = LoadU(di, in_even + strideH + Lanes(di));
169
170
    /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
171
0
    auto dc_1 = LoadU(di, in_odd + Lanes(di)) -
172
0
          ShiftRight<2>(LoadU(di, in_even + Lanes(di)) + s1_1 + two);
173
0
    Store(LoadU(di, in_even + Lanes(di)) + dc_1, di, buf + HWY_PLL_COLS_53 * 0 + Lanes(di));
174
175
0
    uint32_t i;
176
0
    size_t j;
177
0
    for(i = 1, j = 1; i < (total_height - 2 - !(total_height & 1)); i += 2, j++)
178
0
    {
179
0
     auto s2_0 = LoadU(di, in_even + (j + 1) * strideH);
180
0
     auto s2_1 = LoadU(di, in_even + (j + 1) * strideH + Lanes(di));
181
182
     /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
183
0
     auto dn_0 = LoadU(di, in_odd + j * strideL) - ShiftRight<2>(s1_0 + s2_0 + two);
184
0
     auto dn_1 = LoadU(di, in_odd + j * strideL + Lanes(di)) - ShiftRight<2>(s1_1 + s2_1 + two);
185
186
0
     Store(dc_0, di, buf + HWY_PLL_COLS_53 * i);
187
0
     Store(dc_1, di, buf + HWY_PLL_COLS_53 * i + Lanes(di));
188
189
     /* buf[i + 1] = s1 + ((dn + dc) >> 1); */
190
0
     Store(s1_0 + ShiftRight<1>(dn_0 + dc_0), di, buf + HWY_PLL_COLS_53 * (i + 1) + 0);
191
0
     Store(s1_1 + ShiftRight<1>(dn_1 + dc_1), di, buf + HWY_PLL_COLS_53 * (i + 1) + Lanes(di));
192
193
0
     dc_0 = dn_0;
194
0
     s1_0 = s2_0;
195
0
     dc_1 = dn_1;
196
0
     s1_1 = s2_1;
197
0
    }
198
0
    Store(dc_0, di, buf + HWY_PLL_COLS_53 * i);
199
0
    Store(dc_1, di, buf + HWY_PLL_COLS_53 * i + Lanes(di));
200
201
0
    if(!(total_height & 1))
202
0
    {
203
     /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
204
0
     auto dn_0 = LoadU(di, in_odd + (size_t)(total_height / 2 - 1) * strideL) -
205
0
           ShiftRight<2>(s1_0 + s1_0 + two);
206
0
     auto dn_1 = LoadU(di, in_odd + (size_t)(total_height / 2 - 1) * strideL + Lanes(di)) -
207
0
           ShiftRight<2>(s1_1 + s1_1 + two);
208
209
     /* buf[len - 2] = s1 + ((dn + dc) >> 1); */
210
0
     Store(s1_0 + ShiftRight<1>(dn_0 + dc_0), di,
211
0
         buf + HWY_PLL_COLS_53 * (total_height - 2) + 0);
212
0
     Store(s1_1 + ShiftRight<1>(dn_1 + dc_1), di,
213
0
         buf + HWY_PLL_COLS_53 * (total_height - 2) + Lanes(di));
214
215
0
     Store(dn_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0);
216
0
     Store(dn_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di));
217
0
    }
218
0
    else
219
0
    {
220
0
     Store(s1_0 + dc_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0);
221
0
     Store(s1_1 + dc_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di));
222
0
    }
223
0
    hwy_decompress_v_final_memcpy_53(buf, total_height, dest, strideDest);
224
0
   }
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int)
225
226
} // namespace HWY_NAMESPACE
227
} // namespace grk
228
HWY_AFTER_NAMESPACE();
229
230
#if HWY_ONCE
231
namespace grk
232
{
233
/* Highway exports for the three per-target functions defined above in the
 * HWY_NAMESPACE section; this part of the file is compiled only once
 * (guarded by HWY_ONCE).
 * NOTE(review): presumably consumed through HWY_DYNAMIC_DISPATCH further down
 * the file — confirm. */
HWY_EXPORT(hwy_num_lanes);
234
HWY_EXPORT(hwy_decompress_v_parity_even_mcols_53);
235
HWY_EXPORT(hwy_decompress_v_parity_odd_mcols_53);
236
/* <summary>                             */
237
/* Determine maximum computed resolution level for inverse wavelet transform */
238
/* </summary>                            */
239
uint32_t max_resolution(Resolution* GRK_RESTRICT r, uint32_t i)
240
0
{
241
0
   uint32_t mr = 0;
242
0
   while(--i)
243
0
   {
244
0
    ++r;
245
0
    uint32_t w;
246
0
    if(mr < (w = r->x1 - r->x0))
247
0
     mr = w;
248
0
    if(mr < (w = r->y1 - r->y0))
249
0
     mr = w;
250
0
   }
251
0
   return mr;
252
0
}
253
254
/**********************************************************************************
255
 *
256
 * Full 9/7 Inverse Wavelet
257
 *
258
 *
259
 *
260
 **********************************************************************************/
261
262
/* Constants of the 9/7 inverse transform as used by decompress_step_97:
 * K and twice_invK are the two step-1 scale factors, and dwt_delta, dwt_gamma,
 * dwt_beta, dwt_alpha are passed, in that order, to the four step-2 lifting
 * passes.
 * NOTE(review): the integers in the trailing comments look like each value
 * scaled by 8192 and rounded — confirm before relying on them. */
static const float dwt_alpha = 1.586134342f; /*  12994 */
263
static const float dwt_beta = 0.052980118f; /*    434 */
264
static const float dwt_gamma = -0.882911075f; /*  -7233 */
265
static const float dwt_delta = -0.443506852f; /*  -3633 */
266
static const float K = 1.230174105f; /*  10078 */
267
static const float twice_invK = 1.625732422f;
268
269
// #undef __SSE__
270
271
#ifdef __SSE__
272
/* SSE scaling step of the 9/7 inverse transform: multiplies d.len vectors,
 * spaced two __m128 apart starting at d.data, by the broadcast constant c. */
void WaveletReverse::decompress_step1_sse_97(Params97 d, const __m128 c)
{
   __m128* vec = (__m128*)d.data;
   uint32_t done = 0;

   /* main loop: four scaled vectors per iteration */
   while(done + 3 < d.len)
   {
      for(uint32_t u = 0; u < 8; u += 2)
         vec[u] = _mm_mul_ps(vec[u], c);
      vec += 8;
      done += 4;
   }

   /* remainder: one scaled vector per iteration */
   while(done < d.len)
   {
      *vec = _mm_mul_ps(*vec, c);
      vec += 2;
      ++done;
   }
}
287
#endif
288
289
void WaveletReverse::decompress_step1_97(const Params97& d, const float c)
290
1.52M
{
291
1.52M
#ifdef __SSE__
292
1.52M
   decompress_step1_sse_97(d, _mm_set1_ps(c));
293
#else
294
   float* GRK_RESTRICT fw = (float*)d.data;
295
296
   for(uint32_t i = 0; i < d.len; ++i, fw += 8)
297
   {
298
    fw[0] *= c;
299
    fw[1] *= c;
300
    fw[2] *= c;
301
    fw[3] *= c;
302
    ;
303
   }
304
#endif
305
1.52M
}
306
307
#ifdef __SSE__
308
static void decompress_step2_sse_97(const Params97& d, __m128 c)
309
3.02M
{
310
3.02M
   __m128* GRK_RESTRICT vec_data = (__m128*)d.data;
311
312
3.02M
   uint32_t imax = (std::min<uint32_t>)(d.len, d.lenMax);
313
314
   // initial tmp1 value is only necessary when
315
   // absolute start of line is at 0
316
3.02M
   auto tmp1 = ((__m128*)d.dataPrev)[0];
317
3.02M
   uint32_t i = 0;
318
34.0M
   for(; i + 3 < imax; i += 4)
319
31.0M
   {
320
31.0M
    auto tmp2 = vec_data[-1];
321
31.0M
    auto tmp3 = vec_data[0];
322
31.0M
    auto tmp4 = vec_data[1];
323
31.0M
    auto tmp5 = vec_data[2];
324
31.0M
    auto tmp6 = vec_data[3];
325
31.0M
    auto tmp7 = vec_data[4];
326
31.0M
    auto tmp8 = vec_data[5];
327
31.0M
    auto tmp9 = vec_data[6];
328
31.0M
    vec_data[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
329
31.0M
    vec_data[1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
330
31.0M
    vec_data[3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
331
31.0M
    vec_data[5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
332
31.0M
    tmp1 = tmp9;
333
31.0M
    vec_data += 8;
334
31.0M
   }
335
336
7.11M
   for(; i < imax; ++i)
337
4.09M
   {
338
4.09M
    auto tmp2 = vec_data[-1];
339
4.09M
    auto tmp3 = vec_data[0];
340
4.09M
    vec_data[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
341
4.09M
    tmp1 = tmp3;
342
4.09M
    vec_data += 2;
343
4.09M
   }
344
3.02M
   if(d.lenMax < d.len)
345
1.48M
   {
346
1.48M
    assert(d.lenMax + 1 == d.len);
347
1.48M
    c = _mm_add_ps(c, c);
348
1.48M
    c = _mm_mul_ps(c, vec_data[-2]);
349
1.48M
    vec_data[-1] = _mm_add_ps(vec_data[-1], c);
350
1.48M
   }
351
3.02M
}
352
#endif
353
354
static void decompress_step2_97(const Params97& d, float c)
355
3.00M
{
356
3.00M
#ifdef __SSE__
357
3.00M
   decompress_step2_sse_97(d, _mm_set1_ps(c));
358
#else
359
360
   float* dataPrev = (float*)d.dataPrev;
361
   float* data = (float*)d.data;
362
363
   uint32_t imax = (std::min<uint32_t>)(d.len, d.lenMax);
364
   for(uint32_t i = 0; i < imax; ++i)
365
   {
366
    float tmp1_1 = dataPrev[0];
367
    float tmp1_2 = dataPrev[1];
368
    float tmp1_3 = dataPrev[2];
369
    float tmp1_4 = dataPrev[3];
370
    float tmp2_1 = data[-4];
371
    float tmp2_2 = data[-3];
372
    float tmp2_3 = data[-2];
373
    float tmp2_4 = data[-1];
374
    float tmp3_1 = data[0];
375
    float tmp3_2 = data[1];
376
    float tmp3_3 = data[2];
377
    float tmp3_4 = data[3];
378
    data[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
379
    data[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
380
    data[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
381
    data[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
382
    dataPrev = data;
383
    data += 8;
384
   }
385
   if(d.lenMax < d.len)
386
   {
387
    assert(d.lenMax + 1 == d.len);
388
    c += c;
389
    data[-4] = data[-4] + dataPrev[0] * c;
390
    data[-3] = data[-3] + dataPrev[1] * c;
391
    data[-2] = data[-2] + dataPrev[2] * c;
392
    data[-1] = data[-1] + dataPrev[3] * c;
393
   }
394
#endif
395
3.00M
}
396
/* <summary>                             */
397
/* Inverse 9-7 wavelet transform in 1-D. */
398
/* </summary>                            */
399
void WaveletReverse::decompress_step_97(dwt_data<vec4f>* GRK_RESTRICT dwt)
400
878k
{
401
878k
   if((!dwt->parity && dwt->dn_full == 0 && dwt->sn_full <= 1) ||
402
878k
    (dwt->parity && dwt->sn_full == 0 && dwt->dn_full >= 1))
403
22.4k
    return;
404
405
856k
   decompress_step1_97(makeParams97(dwt, true, true), K);
406
856k
   decompress_step1_97(makeParams97(dwt, false, true), twice_invK);
407
856k
   decompress_step2_97(makeParams97(dwt, true, false), dwt_delta);
408
856k
   decompress_step2_97(makeParams97(dwt, false, false), dwt_gamma);
409
856k
   decompress_step2_97(makeParams97(dwt, true, false), dwt_beta);
410
856k
   decompress_step2_97(makeParams97(dwt, false, false), dwt_alpha);
411
856k
}
412
void WaveletReverse::interleave_h_97(dwt_data<vec4f>* GRK_RESTRICT dwt,
413
                   grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH,
414
                   uint32_t remaining_height)
415
0
{
416
0
   float* GRK_RESTRICT bi = (float*)(dwt->mem + dwt->parity);
417
0
   uint32_t x0 = dwt->win_l.x0;
418
0
   uint32_t x1 = dwt->win_l.x1;
419
0
   const size_t vec4f_elts = vec4f::NUM_ELTS;
420
0
   for(uint32_t k = 0; k < 2; ++k)
421
0
   {
422
0
    auto band = (k == 0) ? winL.buf_ : winH.buf_;
423
0
    uint32_t stride = (k == 0) ? winL.stride_ : winH.stride_;
424
0
    if(remaining_height >= vec4f_elts && ((size_t)band & 0x0f) == 0 && ((size_t)bi & 0x0f) == 0 &&
425
0
     (stride & 0x0f) == 0)
426
0
    {
427
     /* Fast code path */
428
0
     for(uint32_t i = x0; i < x1; ++i, bi += vec4f_elts * 2)
429
0
     {
430
0
      uint32_t j = i;
431
0
      bi[0] = band[j];
432
0
      j += stride;
433
0
      bi[1] = band[j];
434
0
      j += stride;
435
0
      bi[2] = band[j];
436
0
      j += stride;
437
0
      bi[3] = band[j];
438
0
     }
439
0
    }
440
0
    else
441
0
    {
442
     /* Slow code path */
443
0
     for(uint32_t i = x0; i < x1; ++i, bi += vec4f_elts * 2)
444
0
     {
445
0
      uint32_t j = i;
446
0
      bi[0] = band[j];
447
0
      j += stride;
448
0
      if(remaining_height == 1)
449
0
         continue;
450
0
      bi[1] = band[j];
451
0
      j += stride;
452
0
      if(remaining_height == 2)
453
0
         continue;
454
0
      bi[2] = band[j];
455
0
      j += stride;
456
0
      if(remaining_height == 3)
457
0
         continue;
458
0
      bi[3] = band[j];
459
0
     }
460
0
    }
461
0
    bi = (float*)(dwt->mem + 1 - dwt->parity);
462
0
    x0 = dwt->win_h.x0;
463
0
    x1 = dwt->win_h.x1;
464
0
   }
465
0
}
466
void WaveletReverse::decompress_h_strip_97(dwt_data<vec4f>* GRK_RESTRICT horiz,
467
                       const uint32_t resHeight, grk_buf2d_simple<float> winL,
468
                       grk_buf2d_simple<float> winH,
469
                       grk_buf2d_simple<float> winDest)
470
0
{
471
0
   float* GRK_RESTRICT dest = winDest.buf_;
472
0
   const uint32_t strideDest = winDest.stride_;
473
0
   uint32_t j;
474
0
   const size_t vec4f_elts = vec4f::NUM_ELTS;
475
0
   for(j = 0; j < (resHeight & (uint32_t)(~(vec4f_elts - 1))); j += vec4f_elts)
476
0
   {
477
0
    interleave_h_97(horiz, winL, winH, resHeight - j);
478
0
    decompress_step_97(horiz);
479
0
    for(uint32_t k = 0; k < horiz->sn_full + horiz->dn_full; k++)
480
0
    {
481
0
     dest[k] = horiz->mem[k].val[0];
482
0
     dest[k + (size_t)strideDest] = horiz->mem[k].val[1];
483
0
     dest[k + (size_t)strideDest * 2] = horiz->mem[k].val[2];
484
0
     dest[k + (size_t)strideDest * 3] = horiz->mem[k].val[3];
485
0
    }
486
0
    winL.buf_ += winL.stride_ << 2;
487
0
    winH.buf_ += winH.stride_ << 2;
488
0
    dest += strideDest << 2;
489
0
   }
490
0
   if(j < resHeight)
491
0
   {
492
0
    interleave_h_97(horiz, winL, winH, resHeight - j);
493
0
    decompress_step_97(horiz);
494
0
    for(uint32_t k = 0; k < horiz->sn_full + horiz->dn_full; k++)
495
0
    {
496
0
     switch(resHeight - j)
497
0
     {
498
0
      case 3:
499
0
         dest[k + (strideDest << 1)] = horiz->mem[k].val[2];
500
      /* FALLTHRU */
501
0
      case 2:
502
0
         dest[k + strideDest] = horiz->mem[k].val[1];
503
      /* FALLTHRU */
504
0
      case 1:
505
0
         dest[k] = horiz->mem[k].val[0];
506
0
     }
507
0
    }
508
0
   }
509
0
}
510
bool WaveletReverse::decompress_h_97(uint8_t res, uint32_t numThreads, size_t dataLength,
511
                   dwt_data<vec4f>& GRK_RESTRICT horiz, const uint32_t resHeight,
512
                   grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH,
513
                   grk_buf2d_simple<float> winDest)
514
0
{
515
0
   if(resHeight == 0)
516
0
    return true;
517
0
   if(numThreads == 1)
518
0
   {
519
0
    decompress_h_strip_97(&horiz, resHeight, winL, winH, winDest);
520
0
   }
521
0
   else
522
0
   {
523
0
    uint32_t numTasks = numThreads;
524
0
    if(resHeight < numTasks)
525
0
     numTasks = resHeight;
526
0
    uint32_t incrPerJob = resHeight / numTasks;
527
0
    auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
528
0
    if(!imageComponentFlow)
529
0
    {
530
0
     Logger::logger_.warn("Missing image component flow");
531
0
     return true;
532
0
    }
533
0
    auto resFlow = imageComponentFlow->getResFlow(res - 1);
534
0
    for(uint32_t j = 0; j < numTasks; ++j)
535
0
    {
536
0
     auto indexMin = j * incrPerJob;
537
0
     auto indexMax = (j < (numTasks - 1U) ? (j + 1U) * incrPerJob : resHeight) - indexMin;
538
0
     auto myhoriz = new dwt_data<vec4f>(horiz);
539
0
     if(!myhoriz->alloc(dataLength))
540
0
     {
541
0
      Logger::logger_.error("Out of memory");
542
0
      return false;
543
0
     }
544
0
     resFlow->waveletHoriz_->nextTask().work([this, myhoriz, indexMax, winL, winH, winDest] {
545
0
      decompress_h_strip_97(myhoriz, indexMax, winL, winH, winDest);
546
0
      delete myhoriz;
547
0
     });
548
0
     winL.incY_IN_PLACE(incrPerJob);
549
0
     winH.incY_IN_PLACE(incrPerJob);
550
0
     winDest.incY_IN_PLACE(incrPerJob);
551
0
    }
552
0
   }
553
0
   return true;
554
0
}
555
void WaveletReverse::interleave_v_97(dwt_data<vec4f>* GRK_RESTRICT dwt,
556
                   grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH,
557
                   uint32_t nb_elts_read)
558
0
{
559
0
   auto bi = dwt->mem + dwt->parity;
560
0
   auto band = winL.buf_ + dwt->win_l.x0 * winL.stride_;
561
0
   for(uint32_t i = dwt->win_l.x0; i < dwt->win_l.x1; ++i, bi += 2)
562
0
   {
563
0
    memcpy((float*)bi, band, nb_elts_read * sizeof(float));
564
0
    band += winL.stride_;
565
0
   }
566
0
   bi = dwt->mem + 1 - dwt->parity;
567
0
   band = winH.buf_ + dwt->win_h.x0 * winH.stride_;
568
0
   for(uint32_t i = dwt->win_h.x0; i < dwt->win_h.x1; ++i, bi += 2)
569
0
   {
570
0
    memcpy((float*)bi, band, nb_elts_read * sizeof(float));
571
0
    band += winH.stride_;
572
0
   }
573
0
}
574
void WaveletReverse::decompress_v_strip_97(dwt_data<vec4f>* GRK_RESTRICT vert,
575
                       const uint32_t resWidth, const uint32_t resHeight,
576
                       grk_buf2d_simple<float> winL,
577
                       grk_buf2d_simple<float> winH,
578
                       grk_buf2d_simple<float> winDest)
579
0
{
580
0
   uint32_t j;
581
0
   const size_t vec4f_elts = vec4f::NUM_ELTS;
582
0
   for(j = 0; j < (resWidth & (uint32_t) ~(vec4f_elts - 1)); j += vec4f_elts)
583
0
   {
584
0
    interleave_v_97(vert, winL, winH, vec4f_elts);
585
0
    decompress_step_97(vert);
586
0
    auto destPtr = winDest.buf_;
587
0
    for(uint32_t k = 0; k < resHeight; ++k)
588
0
    {
589
0
     memcpy(destPtr, vert->mem + k, sizeof(vec4f));
590
0
     destPtr += winDest.stride_;
591
0
    }
592
0
    winL.buf_ += vec4f_elts;
593
0
    winH.buf_ += vec4f_elts;
594
0
    winDest.buf_ += vec4f_elts;
595
0
   }
596
0
   if(j < resWidth)
597
0
   {
598
0
    j = resWidth & (vec4f_elts - 1);
599
0
    interleave_v_97(vert, winL, winH, j);
600
0
    decompress_step_97(vert);
601
0
    auto destPtr = winDest.buf_;
602
0
    for(uint32_t k = 0; k < resHeight; ++k)
603
0
    {
604
0
     memcpy(destPtr, vert->mem + k, j * sizeof(float));
605
0
     destPtr += winDest.stride_;
606
0
    }
607
0
   }
608
0
}
609
bool WaveletReverse::decompress_v_97(uint8_t res, uint32_t numThreads, size_t dataLength,
610
                   dwt_data<vec4f>& GRK_RESTRICT vert, const uint32_t resWidth,
611
                   const uint32_t resHeight, grk_buf2d_simple<float> winL,
612
                   grk_buf2d_simple<float> winH, grk_buf2d_simple<float> winDest)
613
0
{
614
0
   if(resWidth == 0)
615
0
    return true;
616
0
   if(numThreads == 1)
617
0
   {
618
0
    decompress_v_strip_97(&vert, resWidth, resHeight, winL, winH, winDest);
619
0
   }
620
0
   else
621
0
   {
622
0
    auto numTasks = numThreads;
623
0
    if(resWidth < numTasks)
624
0
     numTasks = resWidth;
625
0
    auto incrPerJob = resWidth / numTasks;
626
0
    auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
627
0
    if(!imageComponentFlow)
628
0
    {
629
0
     Logger::logger_.warn("Missing image component flow");
630
0
     return false;
631
0
    }
632
0
    auto resFlow = imageComponentFlow->getResFlow(res - 1);
633
0
    for(uint32_t j = 0; j < numTasks; j++)
634
0
    {
635
0
     auto indexMin = j * incrPerJob;
636
0
     auto indexMax = (j < (numTasks - 1U) ? (j + 1U) * incrPerJob : resWidth) - indexMin;
637
0
     auto myvert = new dwt_data<vec4f>(vert);
638
0
     if(!myvert->alloc(dataLength))
639
0
     {
640
0
      Logger::logger_.error("Out of memory");
641
0
      delete myvert;
642
0
      return false;
643
0
     }
644
0
     resFlow->waveletVert_->nextTask().work(
645
0
       [this, myvert, resHeight, indexMax, winL, winH, winDest] {
646
0
        decompress_v_strip_97(myvert, indexMax, resHeight, winL, winH, winDest);
647
0
        delete myvert;
648
0
       });
649
0
     winL.incX_IN_PLACE(incrPerJob);
650
0
     winH.incX_IN_PLACE(incrPerJob);
651
0
     winDest.incX_IN_PLACE(incrPerJob);
652
0
    }
653
0
   }
654
655
0
   return true;
656
0
}
657
/* <summary>                             */
658
/* Inverse 9-7 wavelet transform in 2-D. */
659
/* </summary>                            */
660
bool WaveletReverse::decompress_tile_97(void)
661
0
{
662
0
   if(numres_ == 1U)
663
0
    return true;
664
665
0
   auto tr = tilec_->resolutions_;
666
0
   auto buf = tilec_->getWindow();
667
0
   uint32_t resWidth = tr->width();
668
0
   uint32_t resHeight = tr->height();
669
670
0
   size_t dataLength = max_resolution(tr, numres_);
671
0
   if(!horizF_.alloc(dataLength))
672
0
   {
673
0
    Logger::logger_.error("decompress_tile_97: out of memory");
674
0
    return false;
675
0
   }
676
0
   vertF_.mem = horizF_.mem;
677
0
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
678
0
   for(uint8_t res = 1; res < numres_; ++res)
679
0
   {
680
0
    horizF_.sn_full = resWidth;
681
0
    vertF_.sn_full = resHeight;
682
0
    ++tr;
683
0
    resWidth = tr->width();
684
0
    resHeight = tr->height();
685
0
    if(resWidth == 0 || resHeight == 0)
686
0
     continue;
687
0
    horizF_.dn_full = resWidth - horizF_.sn_full;
688
0
    horizF_.parity = tr->x0 & 1;
689
0
    horizF_.win_l = grk_line32(0, horizF_.sn_full);
690
0
    horizF_.win_h = grk_line32(0, horizF_.dn_full);
691
0
    auto winSplitL = buf->getResWindowBufferSplitSimpleF(res, SPLIT_L);
692
0
    auto winSplitH = buf->getResWindowBufferSplitSimpleF(res, SPLIT_H);
693
0
    if(!decompress_h_97(res, numThreads, dataLength, horizF_, vertF_.sn_full,
694
0
              buf->getResWindowBufferSimpleF(res - 1U),
695
0
              buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_HL), winSplitL))
696
0
     return false;
697
0
    if(!decompress_h_97(res, numThreads, dataLength, horizF_, resHeight - vertF_.sn_full,
698
0
              buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_LH),
699
0
              buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_HH), winSplitH))
700
0
     return false;
701
0
    vertF_.dn_full = resHeight - vertF_.sn_full;
702
0
    vertF_.parity = tr->y0 & 1;
703
0
    vertF_.win_l = grk_line32(0, vertF_.sn_full);
704
0
    vertF_.win_h = grk_line32(0, vertF_.dn_full);
705
0
    if(!decompress_v_97(res, numThreads, dataLength, vertF_, resWidth, resHeight, winSplitL,
706
0
              winSplitH, buf->getResWindowBufferSimpleF(res)))
707
0
     return false;
708
0
   }
709
710
0
   return true;
711
0
}
712
713
/**************************************************************************************
714
 *
715
 * Full 5/3 Inverse Wavelet
716
 *
717
 *
718
 *************************************************************************************/
719
720
/**
 * Inverse 5/3 lifting for one row whose left-most sample is on an even
 * coordinate (this is the variant the row dispatcher selects for parity 0).
 *
 * Interleaving and lifting are done in a single sweep: every iteration
 * consumes one value from each band and emits two reconstructed samples.
 *
 * @param buf    scratch line receiving the interleaved result (at least wL + wH entries)
 * @param bandL  low-pass input row
 * @param wL     number of low-pass samples
 * @param bandH  high-pass input row
 * @param wH     number of high-pass samples
 * @param dest   output row; receives wL + wH samples copied from buf
 *
 * Requires wL + wH > 1 (asserted).
 */
void WaveletReverse::decompress_h_parity_even_53(int32_t* buf, int32_t* bandL,
                                                 const uint32_t wL, int32_t* bandH,
                                                 const uint32_t wH, int32_t* dest)
{
   const uint32_t total_width = wL + wH;
   assert(total_width > 1);

   // Left edge: (h + 1) >> 1 equals (h + h + 2) >> 2, i.e. the general
   // formula below with the first high-pass value used twice.
   int32_t highNext = bandH[0];
   int32_t evenNext = bandL[0] - ((highNext + 1) >> 1);

   uint32_t outIdx = 0;
   if(total_width > 2)
   {
      uint32_t pair = 1;
      while(outIdx < total_width - 3)
      {
         const int32_t highCur = highNext;
         const int32_t evenCur = evenNext;

         const int32_t low = bandL[pair];
         highNext = bandH[pair];
         evenNext = low - ((highCur + highNext + 2) >> 2);

         buf[outIdx] = evenCur;
         buf[outIdx + 1] = highCur + ((evenCur + evenNext) >> 1);

         outIdx += 2;
         ++pair;
      }
   }
   buf[outIdx] = evenNext;

   const uint32_t last = total_width - 1;
   if((total_width & 1) == 0)
   {
      // Even length: the row ends on a high-pass sample with no right neighbour.
      buf[last] = highNext + evenNext;
   }
   else
   {
      // Odd length: one more low-pass sample closes the row.
      const int32_t tail = bandL[last >> 1] - ((highNext + 1) >> 1);
      buf[last] = tail;
      buf[last - 1] = highNext + ((evenNext + tail) >> 1);
   }
   memcpy(dest, buf, total_width * sizeof(int32_t));
}
760
761
/**
 * Inverse 5/3 lifting for one row whose left-most sample is on an odd
 * coordinate (this is the variant the row dispatcher selects for parity 1
 * when the row has more than two samples).
 *
 * Interleaving and lifting are done in a single sweep.
 *
 * @param buf    scratch line receiving the interleaved result (at least wL + wH entries)
 * @param bandL  low-pass input row
 * @param wL     number of low-pass samples
 * @param bandH  high-pass input row
 * @param wH     number of high-pass samples
 * @param dest   output row; receives wL + wH samples copied from buf
 *
 * Requires wL + wH > 2 (asserted).
 */
void WaveletReverse::decompress_h_parity_odd_53(int32_t* buf, int32_t* bandL,
                                                const uint32_t wL, int32_t* bandH,
                                                const uint32_t wH, int32_t* dest)
{
   const uint32_t total_width = wL + wH;
   assert(total_width > 2);

   const bool evenWidth = (total_width & 1) == 0;

   const int32_t hFirst = bandH[0];
   int32_t hCur = bandH[1];
   int32_t liftedCur = bandL[0] - ((hFirst + hCur + 2) >> 2);
   buf[0] = hFirst + liftedCur;

   // The main sweep stops one pair earlier for even widths, because the
   // final pair is then produced by the dedicated tail code below.
   const uint32_t sweepEnd = total_width - (evenWidth ? 3U : 2U);
   uint32_t outIdx = 1;
   uint32_t pair = 1;
   while(outIdx < sweepEnd)
   {
      const int32_t hNext = bandH[pair + 1];
      const int32_t liftedNext = bandL[pair] - ((hCur + hNext + 2) >> 2);

      buf[outIdx] = liftedCur;
      buf[outIdx + 1] = hCur + ((liftedNext + liftedCur) >> 1);

      liftedCur = liftedNext;
      hCur = hNext;
      outIdx += 2;
      ++pair;
   }
   buf[outIdx] = liftedCur;

   if(evenWidth)
   {
      const int32_t liftedLast = bandL[(total_width >> 1) - 1] - ((hCur + 1) >> 1);
      buf[total_width - 2] = hCur + ((liftedLast + liftedCur) >> 1);
      buf[total_width - 1] = liftedLast;
   }
   else
   {
      buf[total_width - 1] = hCur + liftedCur;
   }
   memcpy(dest, buf, total_width * sizeof(int32_t));
}
798
799
/** Vertical inverse 5x3 wavelet transform for one column, when top-most
800
 * pixel is on even coordinate */
801
void WaveletReverse::decompress_v_parity_even_53(int32_t* buf, int32_t* bandL, const uint32_t hL,
802
                         const uint32_t strideL, int32_t* bandH,
803
                         const uint32_t hH, const uint32_t strideH,
804
                         int32_t* dest, const uint32_t strideDest)
805
0
{
806
0
   const uint32_t total_height = hL + hH;
807
0
   assert(total_height > 1);
808
809
   /* Performs lifting in one single iteration. Saves memory */
810
   /* accesses and explicit interleaving. */
811
0
   int32_t s1n = bandL[0];
812
0
   int32_t d1n = bandH[0];
813
0
   int32_t s0n = s1n - ((d1n + 1) >> 1);
814
815
0
   uint32_t i = 0;
816
0
   if(total_height > 2)
817
0
   {
818
0
    auto bL = bandL + strideL;
819
0
    auto bH = bandH + strideH;
820
0
    for(uint32_t j = 0; i < (total_height - 3); i += 2, j++)
821
0
    {
822
0
     int32_t d1c = d1n;
823
0
     int32_t s0c = s0n;
824
0
     s1n = *bL;
825
0
     bL += strideL;
826
0
     d1n = *bH;
827
0
     bH += strideH;
828
0
     s0n = s1n - ((d1c + d1n + 2) >> 2);
829
0
     buf[i] = s0c;
830
0
     buf[i + 1] = d1c + ((s0c + s0n) >> 1);
831
0
    }
832
0
   }
833
0
   buf[i] = s0n;
834
0
   if(total_height & 1)
835
0
   {
836
0
    buf[total_height - 1] = bandL[((total_height - 1) >> 1) * strideL] - ((d1n + 1) >> 1);
837
0
    buf[total_height - 2] = d1n + ((s0n + buf[total_height - 1]) >> 1);
838
0
   }
839
0
   else
840
0
   {
841
0
    buf[total_height - 1] = d1n + s0n;
842
0
   }
843
0
   for(i = 0; i < total_height; ++i)
844
0
   {
845
0
    *dest = buf[i];
846
0
    dest += strideDest;
847
0
   }
848
0
}
849
/** Vertical inverse 5x3 wavelet transform for one column, when top-most
850
 * pixel is on odd coordinate */
851
void WaveletReverse::decompress_v_parity_odd_53(int32_t* buf, int32_t* bandL, const uint32_t hL,
852
                        const uint32_t strideL, int32_t* bandH,
853
                        const uint32_t hH, const uint32_t strideH,
854
                        int32_t* dest, const uint32_t strideDest)
855
0
{
856
0
   const uint32_t total_height = hL + hH;
857
0
   assert(total_height > 2);
858
859
   /* Performs lifting in one single iteration. Saves memory */
860
   /* accesses and explicit interleaving. */
861
0
   int32_t s1 = bandH[strideH];
862
0
   int32_t dc = bandL[0] - ((bandH[0] + s1 + 2) >> 2);
863
0
   buf[0] = bandH[0] + dc;
864
0
   auto s2_ptr = bandH + (strideH << 1);
865
0
   auto dn_ptr = bandL + strideL;
866
0
   uint32_t i, j;
867
0
   for(i = 1, j = 1; i < (total_height - 2 - !(total_height & 1)); i += 2, j++)
868
0
   {
869
0
    int32_t s2 = *s2_ptr;
870
0
    s2_ptr += strideH;
871
872
0
    int32_t dn = *dn_ptr - ((s1 + s2 + 2) >> 2);
873
0
    dn_ptr += strideL;
874
875
0
    buf[i] = dc;
876
0
    buf[i + 1] = s1 + ((dn + dc) >> 1);
877
0
    dc = dn;
878
0
    s1 = s2;
879
0
   }
880
0
   buf[i] = dc;
881
0
   if(!(total_height & 1))
882
0
   {
883
0
    int32_t dn = bandL[((total_height >> 1) - 1) * strideL] - ((s1 + 1) >> 1);
884
0
    buf[total_height - 2] = s1 + ((dn + dc) >> 1);
885
0
    buf[total_height - 1] = dn;
886
0
   }
887
0
   else
888
0
   {
889
0
    buf[total_height - 1] = s1 + dc;
890
0
   }
891
0
   for(i = 0; i < total_height; ++i)
892
0
   {
893
0
    *dest = buf[i];
894
0
    dest += strideDest;
895
0
   }
896
0
}
897
/* <summary>                            */
898
/* Inverse 5-3 wavelet transform in 1-D for one row. */
899
/* </summary>                           */
900
/* Performs interleave, inverse wavelet transform and copy back to buffer */
901
void WaveletReverse::decompress_h_53(const dwt_data<int32_t>* dwt, int32_t* bandL, int32_t* bandH,
902
                   int32_t* dest)
903
0
{
904
0
   const uint32_t total_width = dwt->sn_full + dwt->dn_full;
905
0
   assert(total_width != 0);
906
0
   if(dwt->parity == 0)
907
0
   { /* Left-most sample is on even coordinate */
908
0
    if(total_width > 1)
909
0
    {
910
0
     decompress_h_parity_even_53(dwt->mem, bandL, dwt->sn_full, bandH, dwt->dn_full, dest);
911
0
    }
912
0
    else
913
0
    {
914
0
     assert(dwt->sn_full == 1);
915
     // only L op: only one sample in L band and H band is empty
916
0
     dest[0] = bandL[0];
917
0
    }
918
0
   }
919
0
   else
920
0
   { /* Left-most sample is on odd coordinate */
921
0
    if(total_width == 1)
922
0
    {
923
0
     assert(dwt->dn_full == 1);
924
     // only H op: only one sample in H band and L band is empty
925
0
     dest[0] = bandH[0] >> 1;
926
0
    }
927
0
    else if(total_width == 2)
928
0
    {
929
0
     dwt->mem[1] = bandL[0] - ((bandH[0] + 1) >> 1);
930
0
     dest[0] = bandH[0] + dwt->mem[1];
931
0
     dest[1] = dwt->mem[1];
932
0
    }
933
0
    else
934
0
    {
935
0
     decompress_h_parity_odd_53(dwt->mem, bandL, dwt->sn_full, bandH, dwt->dn_full, dest);
936
0
    }
937
0
   }
938
0
}
939
940
/* <summary>                            */
941
/* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
942
/* </summary>                           */
943
/* Performs interleave, inverse wavelet transform and copy back to buffer */
944
/** Number of columns that we can process in parallel in the vertical pass */
945
0
#define PLL_COLS_53 (2 * uint32_t(HWY_DYNAMIC_DISPATCH(hwy_num_lanes)()))
946
void WaveletReverse::decompress_v_53(const dwt_data<int32_t>* dwt, grk_buf2d_simple<int32_t> winL,
947
                   grk_buf2d_simple<int32_t> winH,
948
                   grk_buf2d_simple<int32_t> winDest, uint32_t nb_cols)
949
0
{
950
0
   const uint32_t total_height = dwt->sn_full + dwt->dn_full;
951
0
   assert(total_height != 0);
952
0
   if(dwt->parity == 0)
953
0
   {
954
0
    if(total_height == 1)
955
0
    {
956
0
     for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winDest.buf_++)
957
0
      winDest.buf_[0] = winL.buf_[0];
958
0
    }
959
0
    else
960
0
    {
961
0
     if(nb_cols == PLL_COLS_53)
962
0
     {
963
      /* Same as below general case, except that thanks to SSE2/AVX2 */
964
      /* we can efficiently process 8/16 columns in parallel */
965
0
      HWY_DYNAMIC_DISPATCH(hwy_decompress_v_parity_even_mcols_53)
966
0
      (dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, winH.buf_, dwt->dn_full, winH.stride_,
967
0
       winDest.buf_, winDest.stride_);
968
0
     }
969
0
     else
970
0
     {
971
0
      for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++)
972
0
         decompress_v_parity_even_53(dwt->mem, winL.buf_, dwt->sn_full, winL.stride_,
973
0
                       winH.buf_, dwt->dn_full, winL.stride_, winDest.buf_,
974
0
                       winDest.stride_);
975
0
     }
976
0
    }
977
0
   }
978
0
   else
979
0
   {
980
0
    if(total_height == 1)
981
0
    {
982
0
     for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winDest.buf_++)
983
0
      winDest.buf_[0] = winL.buf_[0] >> 1;
984
0
    }
985
0
    else if(total_height == 2)
986
0
    {
987
0
     auto out = dwt->mem;
988
0
     for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++)
989
0
     {
990
0
      out[1] = winL.buf_[0] - ((winH.buf_[0] + 1) >> 1);
991
0
      winDest.buf_[0] = winH.buf_[0] + out[1];
992
0
      winDest.buf_[1] = out[1];
993
0
     }
994
0
    }
995
0
    else
996
0
    {
997
0
     if(nb_cols == PLL_COLS_53)
998
0
     {
999
      /* Same as below general case, except that thanks to SSE2/AVX2 */
1000
      /* we can efficiently process 8/16 columns in parallel */
1001
0
      HWY_DYNAMIC_DISPATCH(hwy_decompress_v_parity_odd_mcols_53)
1002
0
      (dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, winH.buf_, dwt->dn_full, winH.stride_,
1003
0
       winDest.buf_, winDest.stride_);
1004
0
     }
1005
0
     else
1006
0
     {
1007
0
      for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++)
1008
0
         decompress_v_parity_odd_53(dwt->mem, winL.buf_, dwt->sn_full, winL.stride_,
1009
0
                      winH.buf_, dwt->dn_full, winH.stride_, winDest.buf_,
1010
0
                      winDest.stride_);
1011
0
     }
1012
0
    }
1013
0
   }
1014
0
}
1015
1016
void WaveletReverse::decompress_h_strip_53(const dwt_data<int32_t>* horiz, uint32_t hMin,
1017
                       uint32_t hMax, grk_buf2d_simple<int32_t> winL,
1018
                       grk_buf2d_simple<int32_t> winH,
1019
                       grk_buf2d_simple<int32_t> winDest)
1020
0
{
1021
0
   for(uint32_t j = hMin; j < hMax; ++j)
1022
0
   {
1023
0
    decompress_h_53(horiz, winL.buf_, winH.buf_, winDest.buf_);
1024
0
    winL.incY_IN_PLACE(1);
1025
0
    winH.incY_IN_PLACE(1);
1026
0
    winDest.incY_IN_PLACE(1);
1027
0
   }
1028
0
}
1029
bool WaveletReverse::decompress_h_53(uint8_t res, TileComponentWindow<int32_t>* buf,
1030
                   uint32_t resHeight, size_t dataLength)
1031
0
{
1032
0
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
1033
0
   grk_buf2d_simple<int32_t> winL, winH, winDest;
1034
0
   auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
1035
0
   auto resFlow = imageComponentFlow->getResFlow(res - 1);
1036
0
   uint32_t numTasks[2] = {0, 0};
1037
0
   uint32_t height[2] = {0, 0};
1038
0
   for(uint32_t orient = 0; orient < 2; ++orient)
1039
0
   {
1040
0
    height[orient] = (orient == 0) ? vert_.sn_full : resHeight - vert_.sn_full;
1041
0
    if(numThreads > 1)
1042
0
     numTasks[orient] = height[orient] < numThreads ? height[orient] : numThreads;
1043
0
   }
1044
0
   for(uint32_t orient = 0; orient < 2; ++orient)
1045
0
   {
1046
0
    if(height[orient] == 0)
1047
0
     continue;
1048
0
    if(orient == 0)
1049
0
    {
1050
0
     winL = buf->getResWindowBufferSimple(res - 1U);
1051
0
     winH = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_HL);
1052
0
     winDest = buf->getResWindowBufferSplitSimple(res, SPLIT_L);
1053
0
    }
1054
0
    else
1055
0
    {
1056
0
     winL = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_LH);
1057
0
     winH = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_HH);
1058
0
     winDest = buf->getResWindowBufferSplitSimple(res, SPLIT_H);
1059
0
    }
1060
0
    if(numThreads == 1)
1061
0
    {
1062
0
     if(!horiz_.mem)
1063
0
     {
1064
0
      if(!horiz_.alloc(dataLength))
1065
0
      {
1066
0
         Logger::logger_.error("Out of memory");
1067
0
         return false;
1068
0
      }
1069
0
      vert_.mem = horiz_.mem;
1070
0
     }
1071
0
     decompress_h_strip_53(&horiz_, 0, height[orient], winL, winH, winDest);
1072
0
    }
1073
0
    else
1074
0
    {
1075
0
     uint32_t incrPerJob = height[orient] / numTasks[orient];
1076
0
     for(uint32_t j = 0; j < numTasks[orient]; ++j)
1077
0
     {
1078
0
      auto indexMin = j * incrPerJob;
1079
0
      auto indexMax = j < (numTasks[orient] - 1U) ? (j + 1U) * incrPerJob : height[orient];
1080
0
      auto horiz = new dwt_data<int32_t>(horiz_);
1081
0
      if(!horiz->alloc(dataLength))
1082
0
      {
1083
0
         Logger::logger_.error("Out of memory");
1084
0
         delete horiz;
1085
0
         return false;
1086
0
      }
1087
0
      resFlow->waveletHoriz_->nextTask().work(
1088
0
        [this, horiz, winL, winH, winDest, indexMin, indexMax] {
1089
0
           decompress_h_strip_53(horiz, indexMin, indexMax, winL, winH, winDest);
1090
0
           delete horiz;
1091
0
        });
1092
0
      winL.incY_IN_PLACE(incrPerJob);
1093
0
      winH.incY_IN_PLACE(incrPerJob);
1094
0
      winDest.incY_IN_PLACE(incrPerJob);
1095
0
     }
1096
0
    }
1097
0
   }
1098
1099
0
   return true;
1100
0
}
1101
1102
void WaveletReverse::decompress_v_strip_53(const dwt_data<int32_t>* vert, uint32_t wMin,
1103
                       uint32_t wMax, grk_buf2d_simple<int32_t> winL,
1104
                       grk_buf2d_simple<int32_t> winH,
1105
                       grk_buf2d_simple<int32_t> winDest)
1106
0
{
1107
0
   uint32_t j;
1108
0
   for(j = wMin; j + PLL_COLS_53 <= wMax; j += PLL_COLS_53)
1109
0
   {
1110
0
    decompress_v_53(vert, winL, winH, winDest, PLL_COLS_53);
1111
0
    winL.incX_IN_PLACE(PLL_COLS_53);
1112
0
    winH.incX_IN_PLACE(PLL_COLS_53);
1113
0
    winDest.incX_IN_PLACE(PLL_COLS_53);
1114
0
   }
1115
0
   if(j < wMax)
1116
0
    decompress_v_53(vert, winL, winH, winDest, wMax - j);
1117
0
}
1118
1119
bool WaveletReverse::decompress_v_53(uint8_t res, TileComponentWindow<int32_t>* buf,
1120
                   uint32_t resWidth, size_t dataLength)
1121
0
{
1122
0
   if(resWidth == 0)
1123
0
    return true;
1124
0
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
1125
0
   auto winL = buf->getResWindowBufferSplitSimple(res, SPLIT_L);
1126
0
   auto winH = buf->getResWindowBufferSplitSimple(res, SPLIT_H);
1127
0
   auto winDest = buf->getResWindowBufferSimple(res);
1128
0
   if(numThreads == 1)
1129
0
   {
1130
0
    if(!horiz_.mem)
1131
0
    {
1132
0
     if(!horiz_.alloc(dataLength))
1133
0
     {
1134
0
      Logger::logger_.error("Out of memory");
1135
0
      return false;
1136
0
     }
1137
0
     vert_.mem = horiz_.mem;
1138
0
    }
1139
0
    decompress_v_strip_53(&vert_, 0, resWidth, winL, winH, winDest);
1140
0
   }
1141
0
   else
1142
0
   {
1143
0
    auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
1144
0
    auto resFlow = imageComponentFlow->getResFlow(res - 1);
1145
0
    const uint32_t numTasks = resWidth < numThreads ? resWidth : numThreads;
1146
0
    uint32_t step = resWidth / numTasks;
1147
0
    for(uint32_t j = 0; j < numTasks; j++)
1148
0
    {
1149
0
     auto indexMin = j * step;
1150
0
     auto indexMax = j < (numTasks - 1U) ? (j + 1U) * step : resWidth;
1151
0
     auto vert = new dwt_data<int32_t>(vert_);
1152
0
     if(!vert->alloc(dataLength))
1153
0
     {
1154
0
      Logger::logger_.error("Out of memory");
1155
0
      delete vert;
1156
0
      return false;
1157
0
     }
1158
0
     resFlow->waveletVert_->nextTask().work(
1159
0
       [this, vert, indexMin, indexMax, winL, winH, winDest] {
1160
0
        decompress_v_strip_53(vert, indexMin, indexMax, winL, winH, winDest);
1161
0
        delete vert;
1162
0
       });
1163
0
     winL.incX_IN_PLACE(step);
1164
0
     winH.incX_IN_PLACE(step);
1165
0
     winDest.incX_IN_PLACE(step);
1166
0
    }
1167
0
   }
1168
0
   return true;
1169
0
}
1170
/* <summary>                            */
1171
/* Inverse wavelet transform in 2-D.    */
1172
/* </summary>                           */
1173
bool WaveletReverse::decompress_tile_53(void)
1174
0
{
1175
0
   if(numres_ == 1U)
1176
0
    return true;
1177
1178
0
   auto tileCompRes = tilec_->resolutions_;
1179
0
   auto buf = tilec_->getWindow();
1180
0
   size_t dataLength = max_resolution(tileCompRes, numres_);
1181
   /* overflow check */
1182
0
   if(dataLength > (SIZE_MAX / PLL_COLS_53 / sizeof(int32_t)))
1183
0
   {
1184
0
    Logger::logger_.error("Overflow");
1185
0
    return false;
1186
0
   }
1187
   /* We need PLL_COLS_53 times the height of the array, */
1188
   /* since for the vertical pass */
1189
   /* we process PLL_COLS_53 columns at a time */
1190
0
   dataLength *= PLL_COLS_53 * sizeof(int32_t);
1191
0
   for(uint8_t res = 1; res < numres_; ++res)
1192
0
   {
1193
0
    horiz_.sn_full = tileCompRes->width();
1194
0
    vert_.sn_full = tileCompRes->height();
1195
0
    ++tileCompRes;
1196
0
    auto resWidth = tileCompRes->width();
1197
0
    auto resHeight = tileCompRes->height();
1198
0
    if(resWidth == 0 || resHeight == 0)
1199
0
     continue;
1200
0
    horiz_.dn_full = resWidth - horiz_.sn_full;
1201
0
    horiz_.parity = tileCompRes->x0 & 1;
1202
0
    vert_.dn_full = resHeight - vert_.sn_full;
1203
0
    vert_.parity = tileCompRes->y0 & 1;
1204
0
    if(!decompress_h_53(res, buf, resHeight, dataLength))
1205
0
     return false;
1206
0
    if(!decompress_v_53(res, buf, resWidth, dataLength))
1207
0
     return false;
1208
0
   }
1209
1210
0
   return true;
1211
0
}
1212
1213
/*************************************************************************************
1214
 *
1215
 * Partial 5/3 or 9/7 Inverse Wavelet
1216
 *
1217
 **************************************************************************************
1218
 *
1219
 *
1220
 * 5/3 operates on elements of type int32_t while 9/7 operates on elements of type vec4f
1221
 *
1222
 * Horizontal pass
1223
 *
1224
 * Each thread processes a strip running the length of the window, with height
1225
 *   5/3
1226
 *   Height : sizeof(T)/sizeof(int32_t)
1227
 *
1228
 *   9/7
1229
 *   Height : sizeof(T)/sizeof(int32_t)
1230
 *
1231
 * Vertical pass
1232
 *
1233
 * Each thread processes a strip running the height of the window, with width
1234
 *
1235
 *  5/3
1236
 *  Width :  4
1237
 *
1238
 *  9/7
1239
 *  Width :  4
1240
 *
1241
 ****************************************************************************/
1242
template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH>
1243
class PartialInterleaver
1244
{
1245
 public:
1246
   bool interleave_h(dwt_data<T>* dwt, ISparseCanvas* sa, uint32_t y_offset, uint32_t height)
1247
515k
   {
1248
515k
    const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t));
1249
1.07M
    for(uint32_t y = 0; y < height; y++)
1250
583k
    {
1251
     // read one row of L band
1252
583k
     if(dwt->sn_full)
1253
576k
     {
1254
576k
      bool ret =
1255
576k
        sa->read(dwt->resno,
1256
576k
             grk_rect32(dwt->win_l.x0, y_offset + y,
1257
576k
                  std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full),
1258
576k
                  y_offset + y + 1),
1259
576k
             (int32_t*)dwt->memL + y, 2 * stripHeight, 0);
1260
576k
      if(!ret)
1261
5.60k
         return false;
1262
576k
     }
1263
     // read one row of H band
1264
577k
     if(dwt->dn_full)
1265
589k
     {
1266
589k
      bool ret =
1267
589k
        sa->read(dwt->resno,
1268
589k
             grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y,
1269
589k
                  dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1270
589k
                                    dwt->dn_full),
1271
589k
                  y_offset + y + 1),
1272
589k
             (int32_t*)dwt->memH + y, 2 * stripHeight, 0);
1273
589k
      if(!ret)
1274
14.2k
         return false;
1275
589k
     }
1276
577k
    }
1277
1278
495k
    return true;
1279
515k
   }
grk::PartialInterleaver<int, 2u, 4u>::interleave_h(grk::dwt_data<int>*, grk::ISparseCanvas*, unsigned int, unsigned int)
Line
Count
Source
1247
84.3k
   {
1248
84.3k
    const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t));
1249
168k
    for(uint32_t y = 0; y < height; y++)
1250
84.3k
    {
1251
     // read one row of L band
1252
84.3k
     if(dwt->sn_full)
1253
84.3k
     {
1254
84.3k
      bool ret =
1255
84.3k
        sa->read(dwt->resno,
1256
84.3k
             grk_rect32(dwt->win_l.x0, y_offset + y,
1257
84.3k
                  std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full),
1258
84.3k
                  y_offset + y + 1),
1259
84.3k
             (int32_t*)dwt->memL + y, 2 * stripHeight, 0);
1260
84.3k
      if(!ret)
1261
0
         return false;
1262
84.3k
     }
1263
     // read one row of H band
1264
84.3k
     if(dwt->dn_full)
1265
84.2k
     {
1266
84.2k
      bool ret =
1267
84.2k
        sa->read(dwt->resno,
1268
84.2k
             grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y,
1269
84.2k
                  dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1270
84.2k
                                    dwt->dn_full),
1271
84.2k
                  y_offset + y + 1),
1272
84.2k
             (int32_t*)dwt->memH + y, 2 * stripHeight, 0);
1273
84.2k
      if(!ret)
1274
0
         return false;
1275
84.2k
     }
1276
84.3k
    }
1277
1278
84.3k
    return true;
1279
84.3k
   }
grk::PartialInterleaver<grk::vec<float, 4ul>, 2u, 1u>::interleave_h(grk::dwt_data<grk::vec<float, 4ul> >*, grk::ISparseCanvas*, unsigned int, unsigned int)
Line
Count
Source
1247
430k
   {
1248
430k
    const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t));
1249
910k
    for(uint32_t y = 0; y < height; y++)
1250
499k
    {
1251
     // read one row of L band
1252
499k
     if(dwt->sn_full)
1253
492k
     {
1254
492k
      bool ret =
1255
492k
        sa->read(dwt->resno,
1256
492k
             grk_rect32(dwt->win_l.x0, y_offset + y,
1257
492k
                  std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full),
1258
492k
                  y_offset + y + 1),
1259
492k
             (int32_t*)dwt->memL + y, 2 * stripHeight, 0);
1260
492k
      if(!ret)
1261
5.60k
         return false;
1262
492k
     }
1263
     // read one row of H band
1264
493k
     if(dwt->dn_full)
1265
505k
     {
1266
505k
      bool ret =
1267
505k
        sa->read(dwt->resno,
1268
505k
             grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y,
1269
505k
                  dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1270
505k
                                    dwt->dn_full),
1271
505k
                  y_offset + y + 1),
1272
505k
             (int32_t*)dwt->memH + y, 2 * stripHeight, 0);
1273
505k
      if(!ret)
1274
14.2k
         return false;
1275
505k
     }
1276
493k
    }
1277
1278
411k
    return true;
1279
430k
   }
1280
   bool interleave_v(dwt_data<T>* GRK_RESTRICT dwt, ISparseCanvas* sa, uint32_t x_offset,
1281
           uint32_t xWidth)
1282
489k
   {
1283
489k
    const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH;
1284
    // read one vertical strip (of width xWidth <= stripWidth) of L band
1285
489k
    bool ret = false;
1286
489k
    if(dwt->sn_full)
1287
486k
    {
1288
486k
     ret = sa->read(dwt->resno,
1289
486k
            grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth,
1290
486k
                   std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)),
1291
486k
            (int32_t*)dwt->memL, 1, 2 * stripWidth);
1292
486k
    }
1293
    // read one vertical strip (of width x_num_elements <= stripWidth) of H band
1294
489k
    if(dwt->dn_full)
1295
502k
    {
1296
502k
     ret = sa->read(dwt->resno,
1297
502k
            grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth,
1298
502k
                   dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1299
502k
                                   dwt->dn_full)),
1300
502k
            (int32_t*)dwt->memH, 1, 2 * stripWidth);
1301
502k
    }
1302
1303
489k
    return ret;
1304
489k
   }
grk::PartialInterleaver<int, 2u, 4u>::interleave_v(grk::dwt_data<int>*, grk::ISparseCanvas*, unsigned int, unsigned int)
Line
Count
Source
1282
29.9k
   {
1283
29.9k
    const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH;
1284
    // read one vertical strip (of width xWidth <= stripWidth) of L band
1285
29.9k
    bool ret = false;
1286
29.9k
    if(dwt->sn_full)
1287
30.0k
    {
1288
30.0k
     ret = sa->read(dwt->resno,
1289
30.0k
            grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth,
1290
30.0k
                   std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)),
1291
30.0k
            (int32_t*)dwt->memL, 1, 2 * stripWidth);
1292
30.0k
    }
1293
    // read one vertical strip (of width x_num_elements <= stripWidth) of H band
1294
29.9k
    if(dwt->dn_full)
1295
30.2k
    {
1296
30.2k
     ret = sa->read(dwt->resno,
1297
30.2k
            grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth,
1298
30.2k
                   dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1299
30.2k
                                   dwt->dn_full)),
1300
30.2k
            (int32_t*)dwt->memH, 1, 2 * stripWidth);
1301
30.2k
    }
1302
1303
29.9k
    return ret;
1304
29.9k
   }
grk::PartialInterleaver<grk::vec<float, 4ul>, 2u, 1u>::interleave_v(grk::dwt_data<grk::vec<float, 4ul> >*, grk::ISparseCanvas*, unsigned int, unsigned int)
Line
Count
Source
1282
459k
   {
1283
459k
    const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH;
1284
    // read one vertical strip (of width xWidth <= stripWidth) of L band
1285
459k
    bool ret = false;
1286
459k
    if(dwt->sn_full)
1287
456k
    {
1288
456k
     ret = sa->read(dwt->resno,
1289
456k
            grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth,
1290
456k
                   std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)),
1291
456k
            (int32_t*)dwt->memL, 1, 2 * stripWidth);
1292
456k
    }
1293
    // read one vertical strip (of width x_num_elements <= stripWidth) of H band
1294
459k
    if(dwt->dn_full)
1295
472k
    {
1296
472k
     ret = sa->read(dwt->resno,
1297
472k
            grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth,
1298
472k
                   dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH,
1299
472k
                                   dwt->dn_full)),
1300
472k
            (int32_t*)dwt->memH, 1, 2 * stripWidth);
1301
472k
    }
1302
1303
459k
    return ret;
1304
459k
   }
1305
};
1306
template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH>
1307
class Partial53 : public PartialInterleaver<T, FILTER_WIDTH, VERT_PASS_WIDTH>
1308
{
1309
 public:
1310
   void decompress_h(dwt_data<T>* dwt)
1311
84.4k
   {
1312
84.4k
#ifndef GRK_DEBUG_SPARSE
1313
407k
#define get_S(buf, i) buf[(i) << 1]
1314
29.8M
#define get_D(buf, i) buf[(1 + ((i) << 1))]
1315
84.4k
#endif
1316
1317
44.6M
#define S(buf, i) buf[(i) << 1]
1318
15.0M
#define D(buf, i) buf[(1 + ((i) << 1))]
1319
1320
// parity == 0
1321
84.4k
#define S_(buf, i) \
1322
100k
   ((i) < -win_l_x0 ? get_S(buf, -win_l_x0) : ((i) >= sn ? get_S(buf, sn - 1) : get_S(buf, i)))
1323
84.4k
#define D_(buf, i) \
1324
174k
   ((i) < -win_h_x0 ? get_D(buf, -win_h_x0) : ((i) >= dn ? get_D(buf, dn - 1) : get_D(buf, i)))
1325
1326
// parity == 1
1327
84.4k
#define SS_(buf, i) \
1328
84.4k
   ((i) < -win_h_x0 ? get_S(buf, -win_h_x0) : ((i) >= dn ? get_S(buf, dn - 1) : get_S(buf, i)))
1329
84.4k
#define DD_(buf, i) \
1330
84.4k
   ((i) < -win_l_x0 ? get_D(buf, -win_l_x0) : ((i) >= sn ? get_D(buf, sn - 1) : get_D(buf, i)))
1331
1332
84.4k
    int64_t i;
1333
84.4k
    int64_t parity = dwt->parity;
1334
84.4k
    int64_t win_l_x0 = dwt->win_l.x0;
1335
84.4k
    int64_t win_l_x1 = dwt->win_l.x1;
1336
84.4k
    int64_t win_h_x0 = dwt->win_h.x0;
1337
84.4k
    int64_t win_h_x1 = dwt->win_h.x1;
1338
84.4k
    assert(dwt->win_l.x0 <= dwt->sn_full);
1339
84.4k
    int64_t sn = (int64_t)dwt->sn_full - (int64_t)dwt->win_l.x0;
1340
84.4k
    int64_t sn_full = dwt->sn_full;
1341
84.4k
    assert(dwt->win_h.x0 <= dwt->dn_full);
1342
84.4k
    int64_t dn = (int64_t)dwt->dn_full - (int64_t)dwt->win_h.x0;
1343
84.4k
    int64_t dn_full = dwt->dn_full;
1344
1345
84.4k
    adjust_bounds(dwt, sn_full, dn_full, &sn, &dn);
1346
1347
84.4k
    assert(dwt->win_l.x1 <= sn_full && dwt->win_h.x1 <= dn_full);
1348
1349
84.4k
    auto buf = dwt->mem;
1350
84.4k
    if(!parity)
1351
84.1k
    {
1352
84.1k
     if((dn_full != 0) || (sn_full > 1))
1353
84.1k
     {
1354
      /* Naive version is :
1355
      for (i = win_l_x0; i < i_max; i++) {
1356
        S(i) -= (D_(i - 1) + D_(i) + 2) >> 2;
1357
      }
1358
      for (i = win_h_x0; i < win_h_x1; i++) {
1359
        D(i) += (S_(i) + S_(i + 1)) >> 1;
1360
      }
1361
      but the compiler doesn't manage to unroll it to avoid bound
1362
      checking in S_ and D_ macros
1363
      */
1364
84.1k
      i = 0;
1365
84.1k
      int64_t i_max = win_l_x1 - win_l_x0;
1366
84.1k
      if(i < i_max)
1367
84.1k
      {
1368
         /* Left-most case */
1369
84.1k
         S(buf, i) -= (D_(buf, i - 1) + D_(buf, i) + 2) >> 2;
1370
84.1k
         i++;
1371
1372
84.1k
         if(i_max > dn)
1373
3.15k
          i_max = dn;
1374
14.7M
         for(; i < i_max; i++)
1375
          /* No bound checking */
1376
14.6M
          S(buf, i) -= (get_D(buf, i - 1) + get_D(buf, i) + 2) >> 2;
1377
87.3k
         for(; i < win_l_x1 - win_l_x0; i++)
1378
          /* Right-most case */
1379
3.16k
          S(buf, i) -= (D_(buf, i - 1) + D_(buf, i) + 2) >> 2;
1380
84.1k
      }
1381
84.1k
      i = 0;
1382
84.1k
      i_max = win_h_x1 - win_h_x0;
1383
84.1k
      if(i < i_max)
1384
85.5k
      {
1385
85.5k
         if(i_max >= sn)
1386
44.6k
          i_max = sn - 1;
1387
15.0M
         for(; i < i_max; i++)
1388
          /* No bound checking */
1389
14.9M
          D(buf, i) += (S(buf, i) + S(buf, i + 1)) >> 1;
1390
135k
         for(; i < win_h_x1 - win_h_x0; i++)
1391
          /* Right-most case */
1392
50.1k
          D(buf, i) += (S_(buf, i) + S_(buf, i + 1)) >> 1;
1393
85.5k
      }
1394
84.1k
     }
1395
84.1k
    }
1396
273
    else
1397
273
    {
1398
273
     if(sn_full == 0 && dn_full == 1)
1399
15
     {
1400
      // only do L band (high pass)
1401
15
      S(buf, 0) >>= 1;
1402
15
     }
1403
258
     else
1404
258
     {
1405
1.05k
      for(i = 0; i < win_l_x1 - win_l_x0; i++)
1406
795
         D(buf, i) -= (SS_(buf, i) + SS_(buf, i + 1) + 2) >> 2;
1407
1.19k
      for(i = 0; i < win_h_x1 - win_h_x0; i++)
1408
933
         S(buf, i) += (DD_(buf, i) + DD_(buf, i - 1)) >> 1;
1409
258
     }
1410
273
    }
1411
84.4k
   }
1412
   void decompress_v(dwt_data<T>* dwt)
1413
30.2k
   {
1414
30.2k
#ifndef GRK_DEBUG_SPARSE
1415
542k
#define get_S_off(buf, i, off) buf[((i) << 1) * VERT_PASS_WIDTH + off]
1416
989k
#define get_D_off(buf, i, off) buf[(1 + ((i) << 1)) * VERT_PASS_WIDTH + off]
1417
30.2k
#endif
1418
1419
323k
#define S_off(buf, i, off) buf[((i) << 1) * VERT_PASS_WIDTH + off]
1420
224k
#define D_off(buf, i, off) buf[(1 + ((i) << 1)) * VERT_PASS_WIDTH + off]
1421
1422
// parity == 0
1423
174k
#define S_off_(buf, i, off) (((i) >= sn ? get_S_off(buf, sn - 1, off) : get_S_off(buf, i, off)))
1424
227k
#define D_off_(buf, i, off) (((i) >= dn ? get_D_off(buf, dn - 1, off) : get_D_off(buf, i, off)))
1425
1426
30.2k
#define S_sgnd_off_(buf, i, off) \
1427
30.2k
   (((i) < (-win_l_x0) ? get_S_off(buf, -win_l_x0, off) : S_off_(buf, i, off)))
1428
30.2k
#define D_sgnd_off_(buf, i, off) \
1429
193k
   (((i) < (-win_h_x0) ? get_D_off(buf, -win_h_x0, off) : D_off_(buf, i, off)))
1430
1431
// case == 1
1432
30.2k
#define SS_sgnd_off_(buf, i, off)                      \
1433
30.2k
   ((i) < (-win_h_x0) ? get_S_off(buf, -win_h_x0, off) \
1434
30.2k
            : ((i) >= dn ? get_S_off(buf, dn - 1, off) : get_S_off(buf, i, off)))
1435
30.2k
#define DD_sgnd_off_(buf, i, off)                      \
1436
30.2k
   ((i) < (-win_l_x0) ? get_D_off(buf, -win_l_x0, off) \
1437
3.78k
            : ((i) >= sn ? get_D_off(buf, sn - 1, off) : get_D_off(buf, i, off)))
1438
1439
30.2k
#define SS_off_(buf, i, off) (((i) >= dn ? get_S_off(buf, dn - 1, off) : get_S_off(buf, i, off)))
1440
30.2k
#define DD_off_(buf, i, off) (((i) >= sn ? get_D_off(buf, sn - 1, off) : get_D_off(buf, i, off)))
1441
1442
30.2k
    int64_t i;
1443
30.2k
    int64_t parity = dwt->parity;
1444
30.2k
    int64_t win_l_x0 = dwt->win_l.x0;
1445
30.2k
    int64_t win_l_x1 = dwt->win_l.x1;
1446
30.2k
    int64_t win_h_x0 = dwt->win_h.x0;
1447
30.2k
    int64_t win_h_x1 = dwt->win_h.x1;
1448
30.2k
    int64_t sn = (int64_t)dwt->sn_full - (int64_t)dwt->win_l.x0;
1449
30.2k
    int64_t sn_full = dwt->sn_full;
1450
30.2k
    int64_t dn = (int64_t)dwt->dn_full - (int64_t)dwt->win_h.x0;
1451
30.2k
    int64_t dn_full = dwt->dn_full;
1452
1453
30.2k
    adjust_bounds(dwt, sn_full, dn_full, &sn, &dn);
1454
1455
30.2k
    assert(dwt->win_l.x1 <= sn_full && dwt->win_h.x1 <= dn_full);
1456
1457
30.2k
    auto buf = dwt->mem;
1458
30.2k
    if(!parity)
1459
29.9k
    {
1460
29.9k
     if((dn_full != 0) || (sn_full > 1))
1461
29.9k
     {
1462
      /* Naive version is :
1463
      for (i = win_l_x0; i < i_max; i++) {
1464
        S(i) -= (D_(i - 1) + D_(i) + 2) >> 2;
1465
      }
1466
      for (i = win_h_x0; i < win_h_x1; i++) {
1467
        D(i) += (S_(i) + S_(i + 1)) >> 1;
1468
      }
1469
      but the compiler doesn't manage to unroll it to avoid bound
1470
      checking in S_ and D_ macros
1471
      */
1472
1473
      // 1. low pass
1474
29.9k
      i = 0;
1475
29.9k
      int64_t i_max = win_l_x1 - win_l_x0;
1476
29.9k
      assert(win_l_x1 >= win_l_x0);
1477
29.9k
      if(i < i_max)
1478
29.8k
      {
1479
         /* Left-most case */
1480
149k
         for(int64_t off = 0; off < VERT_PASS_WIDTH; off++)
1481
119k
          S_off(buf, i, off) -=
1482
119k
            (D_sgnd_off_(buf, i - 1, off) + D_off_(buf, i, off) + 2) >> 2;
1483
29.8k
         i++;
1484
29.8k
         if(i_max > dn)
1485
964
          i_max = dn;
1486
29.8k
#ifdef __SSE2__
1487
29.8k
         if(i + 1 < i_max)
1488
29.7k
         {
1489
29.7k
          const __m128i two = _mm_set1_epi32(2);
1490
29.7k
          auto Dm1 = _mm_load_si128((__m128i*)(buf + ((i << 1) - 1) * VERT_PASS_WIDTH));
1491
2.03M
          for(; i + 1 < i_max; i += 2)
1492
2.00M
          {
1493
           /* No bound checking */
1494
2.00M
           auto S = _mm_load_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH));
1495
2.00M
           auto D = _mm_load_si128((__m128i*)(buf + ((i << 1) + 1) * VERT_PASS_WIDTH));
1496
2.00M
           auto S1 = _mm_load_si128((__m128i*)(buf + ((i << 1) + 2) * VERT_PASS_WIDTH));
1497
2.00M
           auto D1 = _mm_load_si128((__m128i*)(buf + ((i << 1) + 3) * VERT_PASS_WIDTH));
1498
2.00M
           S = _mm_sub_epi32(
1499
2.00M
             S, _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
1500
2.00M
           S1 = _mm_sub_epi32(
1501
2.00M
             S1, _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
1502
2.00M
           _mm_store_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH), S);
1503
2.00M
           _mm_store_si128((__m128i*)(buf + ((i + 1) << 1) * VERT_PASS_WIDTH), S1);
1504
2.00M
           Dm1 = D1;
1505
2.00M
          }
1506
29.7k
         }
1507
29.8k
#endif
1508
47.5k
         for(; i < i_max; i++)
1509
17.6k
         {
1510
          /* No bound checking */
1511
88.1k
          for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1512
70.5k
           S_off(buf, i, off) -=
1513
70.5k
             (D_sgnd_off_(buf, i - 1, off) + D_off(buf, i, off) + 2) >> 2;
1514
17.6k
         }
1515
30.8k
         for(; i < win_l_x1 - win_l_x0; i++)
1516
970
         {
1517
          /* Right-most case */
1518
4.83k
          for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1519
3.86k
           S_off(buf, i, off) -=
1520
3.86k
             (D_sgnd_off_(buf, i - 1, off) + D_off_(buf, i, off) + 2) >> 2;
1521
970
         }
1522
29.8k
      }
1523
1524
      // 2. high pass
1525
29.9k
      i = 0;
1526
29.9k
      assert(win_h_x1 >= win_h_x0);
1527
29.9k
      i_max = win_h_x1 - win_h_x0;
1528
29.9k
      if(i < i_max)
1529
30.0k
      {
1530
30.0k
         if(i_max >= sn)
1531
18.8k
          i_max = sn - 1;
1532
30.0k
#ifdef __SSE2__
1533
30.0k
         if(i + 1 < i_max)
1534
29.9k
         {
1535
29.9k
          auto S = _mm_load_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH));
1536
2.05M
          for(; i + 1 < i_max; i += 2)
1537
2.02M
          {
1538
           /* No bound checking */
1539
2.02M
           auto D = _mm_load_si128((__m128i*)(buf + (1 + (i << 1)) * VERT_PASS_WIDTH));
1540
2.02M
           auto S1 = _mm_load_si128((__m128i*)(buf + ((i + 1) << 1) * VERT_PASS_WIDTH));
1541
2.02M
           auto D1 =
1542
2.02M
             _mm_load_si128((__m128i*)(buf + (1 + ((i + 1) << 1)) * VERT_PASS_WIDTH));
1543
2.02M
           auto S2 = _mm_load_si128((__m128i*)(buf + ((i + 2) << 1) * VERT_PASS_WIDTH));
1544
2.02M
           D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
1545
2.02M
           D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
1546
2.02M
           _mm_store_si128((__m128i*)(buf + (1 + (i << 1)) * VERT_PASS_WIDTH), D);
1547
2.02M
           _mm_store_si128((__m128i*)(buf + (1 + ((i + 1) << 1)) * VERT_PASS_WIDTH), D1);
1548
2.02M
           S = S2;
1549
2.02M
          }
1550
29.9k
         }
1551
30.0k
#endif
1552
45.8k
         for(; i < i_max; i++)
1553
15.7k
         {
1554
          /* No bound checking */
1555
78.8k
          for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1556
63.0k
           D_off(buf, i, off) += (S_off(buf, i, off) + S_off(buf, i + 1, off)) >> 1;
1557
15.7k
         }
1558
51.8k
         for(; i < win_h_x1 - win_h_x0; i++)
1559
21.8k
         {
1560
          /* Right-most case */
1561
109k
          for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1562
87.2k
           D_off(buf, i, off) += (S_off_(buf, i, off) + S_off_(buf, i + 1, off)) >> 1;
1563
21.8k
         }
1564
30.0k
      }
1565
29.9k
     }
1566
29.9k
    }
1567
276
    else
1568
276
    {
1569
276
     if(sn_full == 0 && dn_full == 1)
1570
6
     {
1571
      // edge case at origin
1572
30
      for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1573
24
         S_off(buf, 0, off) >>= 1;
1574
6
     }
1575
270
     else
1576
270
     {
1577
270
      assert((uint64_t)(dwt->memL + (win_l_x1 - win_l_x0) * VERT_PASS_WIDTH) -
1578
270
             (uint64_t)dwt->allocatedMem <
1579
270
           dwt->lenBytes_);
1580
1.08k
      for(i = 0; i < win_l_x1 - win_l_x0; i++)
1581
815
      {
1582
4.07k
         for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1583
3.26k
          D_off(buf, i, off) -= (SS_off_(buf, i, off) + SS_off_(buf, i + 1, off) + 2) >> 2;
1584
815
      }
1585
270
      assert((uint64_t)(dwt->memH + (win_h_x1 - win_h_x0) * VERT_PASS_WIDTH) -
1586
270
             (uint64_t)dwt->allocatedMem <
1587
270
           dwt->lenBytes_);
1588
1.21k
      for(i = 0; i < win_h_x1 - win_h_x0; i++)
1589
947
      {
1590
4.73k
         for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++)
1591
3.78k
          S_off(buf, i, off) += (DD_off_(buf, i, off) + DD_sgnd_off_(buf, i - 1, off)) >> 1;
1592
947
      }
1593
270
     }
1594
276
    }
1595
30.2k
   }
1596
1597
 private:
1598
   void adjust_bounds(dwt_data<T>* dwt, [[maybe_unused]] int64_t sn_full,
1599
            [[maybe_unused]] int64_t dn_full, int64_t* sn, int64_t* dn)
1600
114k
   {
1601
114k
    if((uint64_t)dwt->memH < (uint64_t)dwt->memL && *sn == *dn)
1602
0
    {
1603
0
     assert(dn_full == sn_full - 1);
1604
0
     (*dn)--;
1605
0
    }
1606
114k
    if((uint64_t)dwt->memL < (uint64_t)dwt->memH && *sn == *dn)
1607
0
    {
1608
0
     assert(sn_full == dn_full - 1);
1609
0
     (*sn)--;
1610
0
    }
1611
114k
   }
1612
#ifdef GRK_DEBUG_SPARSE
1613
   inline T get_S(T* const buf, int64_t i)
1614
   {
1615
    auto ret = buf[(i) << 1];
1616
    assert(abs(ret) < 0xFFFFFFF);
1617
    return ret;
1618
   }
1619
   inline T get_D(T* const buf, int64_t i)
1620
   {
1621
    auto ret = buf[(1 + ((i) << 1))];
1622
    assert(abs(ret) < 0xFFFFFFF);
1623
    return ret;
1624
   }
1625
   inline T get_S_off(T* const buf, int64_t i, int64_t off)
1626
   {
1627
    auto ret = buf[(i) * 2 * VERT_PASS_WIDTH + off];
1628
    assert(abs(ret) < 0xFFFFFFF);
1629
    return ret;
1630
   }
1631
   inline T get_D_off(T* const buf, int64_t i, int64_t off)
1632
   {
1633
    auto ret = buf[(1 + (i) * 2) * VERT_PASS_WIDTH + off];
1634
    assert(abs(ret) < 0xFFFFFFF);
1635
    return ret;
1636
   }
1637
#endif
1638
};
1639
template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH>
1640
class Partial97 : public PartialInterleaver<T, FILTER_WIDTH, VERT_PASS_WIDTH>
1641
{
1642
 public:
1643
   void decompress_h(dwt_data<T>* dwt)
1644
436k
   {
1645
436k
    WaveletReverse::decompress_step_97(dwt);
1646
436k
   }
1647
   void decompress_v(dwt_data<T>* dwt)
1648
449k
   {
1649
449k
    WaveletReverse::decompress_step_97(dwt);
1650
449k
   }
1651
};
1652
// Notes:
1653
// 1. line buffer 0 offset == dwt->win_l.x0
1654
// 2. dwt->memL and dwt->memH are only set for partial decode
1655
Params97 WaveletReverse::makeParams97(dwt_data<vec4f>* dwt, bool isBandL, bool step1)
1656
4.38M
{
1657
4.38M
   Params97 rc;
1658
   // band_0 specifies absolute start of line buffer
1659
4.38M
   int64_t band_0 = isBandL ? dwt->win_l.x0 : dwt->win_h.x0;
1660
4.38M
   int64_t band_1 = isBandL ? dwt->win_l.x1 : dwt->win_h.x1;
1661
4.38M
   auto memPartial = isBandL ? dwt->memL : dwt->memH;
1662
4.38M
   int64_t parityOffset = isBandL ? dwt->parity : !dwt->parity;
1663
4.38M
   int64_t lenMax = isBandL
1664
4.38M
            ? (std::min<int64_t>)(dwt->sn_full, (int64_t)dwt->dn_full - parityOffset)
1665
4.38M
            : (std::min<int64_t>)(dwt->dn_full, (int64_t)dwt->sn_full - parityOffset);
1666
4.38M
   if(lenMax < 0)
1667
4.49k
    lenMax = 0;
1668
4.38M
   assert(lenMax >= band_0);
1669
4.38M
   lenMax -= band_0;
1670
18.4E
   rc.data = memPartial ? memPartial : dwt->mem;
1671
1672
4.38M
   assert(!memPartial || (dwt->win_l.x1 <= dwt->sn_full && dwt->win_h.x1 <= dwt->dn_full));
1673
4.38M
   assert(band_1 >= band_0);
1674
1675
4.38M
   rc.data += parityOffset + band_0 - dwt->win_l.x0;
1676
4.38M
   rc.len = (uint32_t)(band_1 - band_0);
1677
4.38M
   if(!step1)
1678
3.00M
   {
1679
3.00M
    rc.data += 1;
1680
3.00M
    rc.dataPrev = parityOffset ? rc.data - 2 : rc.data;
1681
3.00M
    rc.lenMax = (uint32_t)lenMax;
1682
3.00M
   }
1683
4.38M
   if(memPartial)
1684
4.43M
   {
1685
4.43M
    assert((uint64_t)rc.data >= (uint64_t)dwt->allocatedMem);
1686
4.43M
    assert((uint64_t)rc.data <= (uint64_t)dwt->allocatedMem + dwt->lenBytes_);
1687
4.43M
   }
1688
1689
4.38M
   return rc;
1690
4.38M
};
1691
1692
template<uint32_t FILTER_WIDTH>
1693
struct PartialBandInfo
1694
{
1695
   // 1. set up windows for horizontal and vertical passes
1696
   grk_rect32 bandWindowREL_[BAND_NUM_ORIENTATIONS];
1697
   // two windows formed by horizontal pass and used as input for vertical pass
1698
   grk_rect32 splitWindowREL_[SPLIT_NUM_ORIENTATIONS];
1699
   grk_rect32 resWindowREL_;
1700
1701
   bool alloc(ISparseCanvas* sa, uint8_t resno, Resolution* fullRes,
1702
        TileComponentWindow<int32_t>* tileWindow)
1703
40.4k
   {
1704
40.4k
    bandWindowREL_[BAND_ORIENT_LL] =
1705
40.4k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL);
1706
40.4k
    bandWindowREL_[BAND_ORIENT_HL] =
1707
40.4k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL);
1708
40.4k
    bandWindowREL_[BAND_ORIENT_LH] =
1709
40.4k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH);
1710
40.4k
    bandWindowREL_[BAND_ORIENT_HH] =
1711
40.4k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH);
1712
1713
    // band windows in band coordinates - needed to pre-allocate sparse blocks
1714
40.4k
    grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS];
1715
1716
40.4k
    tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL];
1717
40.4k
    tileBandWindowREL[BAND_ORIENT_HL] =
1718
40.4k
      bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0);
1719
40.4k
    tileBandWindowREL[BAND_ORIENT_LH] =
1720
40.4k
      bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height());
1721
40.4k
    tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan(
1722
40.4k
      fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height());
1723
    // 2. pre-allocate sparse blocks
1724
201k
    for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i)
1725
160k
    {
1726
160k
     auto temp = tileBandWindowREL[i];
1727
160k
     if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()),
1728
160k
             true))
1729
0
      return false;
1730
160k
    }
1731
40.4k
    resWindowREL_ = tileWindow->getResWindowBufferREL(resno);
1732
40.4k
    if(!sa->alloc(resWindowREL_, true))
1733
0
     return false;
1734
40.4k
    splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L);
1735
40.4k
    splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H);
1736
1737
40.4k
    auto fullResNext = fullRes + 1;
1738
121k
    for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k)
1739
80.6k
    {
1740
80.6k
     auto temp = splitWindowREL_[k];
1741
80.6k
     if(!sa->alloc(
1742
80.6k
        temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()),
1743
80.6k
        true))
1744
0
      return false;
1745
80.6k
    }
1746
1747
40.4k
    return true;
1748
40.4k
   }
grk::PartialBandInfo<1u>::alloc(grk::ISparseCanvas*, unsigned char, grk::Resolution*, grk::TileComponentWindow<int>*)
Line
Count
Source
1703
853
   {
1704
853
    bandWindowREL_[BAND_ORIENT_LL] =
1705
853
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL);
1706
853
    bandWindowREL_[BAND_ORIENT_HL] =
1707
853
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL);
1708
853
    bandWindowREL_[BAND_ORIENT_LH] =
1709
853
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH);
1710
853
    bandWindowREL_[BAND_ORIENT_HH] =
1711
853
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH);
1712
1713
    // band windows in band coordinates - needed to pre-allocate sparse blocks
1714
853
    grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS];
1715
1716
853
    tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL];
1717
853
    tileBandWindowREL[BAND_ORIENT_HL] =
1718
853
      bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0);
1719
853
    tileBandWindowREL[BAND_ORIENT_LH] =
1720
853
      bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height());
1721
853
    tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan(
1722
853
      fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height());
1723
    // 2. pre-allocate sparse blocks
1724
4.26k
    for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i)
1725
3.41k
    {
1726
3.41k
     auto temp = tileBandWindowREL[i];
1727
3.41k
     if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()),
1728
3.41k
             true))
1729
0
      return false;
1730
3.41k
    }
1731
853
    resWindowREL_ = tileWindow->getResWindowBufferREL(resno);
1732
853
    if(!sa->alloc(resWindowREL_, true))
1733
0
     return false;
1734
853
    splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L);
1735
853
    splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H);
1736
1737
853
    auto fullResNext = fullRes + 1;
1738
2.55k
    for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k)
1739
1.70k
    {
1740
1.70k
     auto temp = splitWindowREL_[k];
1741
1.70k
     if(!sa->alloc(
1742
1.70k
        temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()),
1743
1.70k
        true))
1744
0
      return false;
1745
1.70k
    }
1746
1747
853
    return true;
1748
853
   }
grk::PartialBandInfo<2u>::alloc(grk::ISparseCanvas*, unsigned char, grk::Resolution*, grk::TileComponentWindow<int>*)
Line
Count
Source
1703
39.5k
   {
1704
39.5k
    bandWindowREL_[BAND_ORIENT_LL] =
1705
39.5k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL);
1706
39.5k
    bandWindowREL_[BAND_ORIENT_HL] =
1707
39.5k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL);
1708
39.5k
    bandWindowREL_[BAND_ORIENT_LH] =
1709
39.5k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH);
1710
39.5k
    bandWindowREL_[BAND_ORIENT_HH] =
1711
39.5k
      tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH);
1712
1713
    // band windows in band coordinates - needed to pre-allocate sparse blocks
1714
39.5k
    grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS];
1715
1716
39.5k
    tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL];
1717
39.5k
    tileBandWindowREL[BAND_ORIENT_HL] =
1718
39.5k
      bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0);
1719
39.5k
    tileBandWindowREL[BAND_ORIENT_LH] =
1720
39.5k
      bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height());
1721
39.5k
    tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan(
1722
39.5k
      fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height());
1723
    // 2. pre-allocate sparse blocks
1724
197k
    for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i)
1725
157k
    {
1726
157k
     auto temp = tileBandWindowREL[i];
1727
157k
     if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()),
1728
157k
             true))
1729
0
      return false;
1730
157k
    }
1731
39.5k
    resWindowREL_ = tileWindow->getResWindowBufferREL(resno);
1732
39.5k
    if(!sa->alloc(resWindowREL_, true))
1733
0
     return false;
1734
39.5k
    splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L);
1735
39.5k
    splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H);
1736
1737
39.5k
    auto fullResNext = fullRes + 1;
1738
118k
    for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k)
1739
78.9k
    {
1740
78.9k
     auto temp = splitWindowREL_[k];
1741
78.9k
     if(!sa->alloc(
1742
78.9k
        temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()),
1743
78.9k
        true))
1744
0
      return false;
1745
78.9k
    }
1746
1747
39.5k
    return true;
1748
39.5k
   }
1749
};
1750
1751
/**
1752
 * ************************************************************************************
1753
 *
1754
 * 5/3 operates on elements of type int32_t while 9/7 operates on elements of type vec4f
1755
 *
1756
 * Horizontal pass
1757
 *
1758
 * Each thread processes a strip running the length of the window, of the following dimensions:
1759
 *
1760
 *   5/3
1761
 *   Height : 1
1762
 *
1763
 *   9/7
1764
 *   Height : 4
1765
 *
1766
 * Vertical pass
1767
 *
1768
 *  5/3
1769
 *  Width :  4
1770
 *
1771
 *  9/7
1772
 *  Width :  1
1773
 *
1774
 ****************************************************************************
1775
 *
1776
 * FILTER_WIDTH value matches the maximum left/right extension given in tables
1777
 * F.2 and F.3 of the standard
1778
 */
1779
template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH, typename D>
1780
1781
bool WaveletReverse::decompress_partial_tile(ISparseCanvas* sa,
1782
                       std::vector<TaskInfo<T, dwt_data<T>>*>& tasks)
1783
18.3k
{
1784
18.3k
   uint8_t numresolutions = tilec_->numresolutions;
1785
18.3k
   auto buf = tilec_->getWindow();
1786
18.3k
   auto simpleBuf = buf->getResWindowBufferHighestSimple();
1787
18.3k
   auto fullRes = tilec_->resolutions_;
1788
18.3k
   auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1;
1789
18.3k
   if(!fullResTopLevel->width() || !fullResTopLevel->height())
1790
6.93k
    return true;
1791
1792
11.3k
   [[maybe_unused]] const uint16_t debug_compno = 0;
1793
11.3k
   const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t);
1794
11.3k
   const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) *
1795
11.3k
            sizeof(T) / sizeof(int32_t);
1796
   // reduce window
1797
11.3k
   auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_);
1798
11.3k
   assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow);
1799
   // shift to relative coordinates
1800
11.3k
   synthesisWindow =
1801
11.3k
     synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0);
1802
11.3k
   if(synthesisWindow.empty())
1803
33
    return true;
1804
11.3k
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
1805
11.3k
   auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
1806
   // imageComponentFlow == nullptr ==> no blocks were decompressed for this component
1807
11.3k
   if(!imageComponentFlow)
1808
2.30k
    return true;
1809
9.04k
   if(numres_ == 1U)
1810
2.46k
   {
1811
2.46k
    auto final_read = [sa, synthesisWindow, simpleBuf]() {
1812
     // final read into tile buffer
1813
2.46k
     bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1814
1815
2.46k
     return ret;
1816
2.46k
    };
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#1}::operator()() const
Line
Count
Source
1811
2.28k
    auto final_read = [sa, synthesisWindow, simpleBuf]() {
1812
     // final read into tile buffer
1813
2.28k
     bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1814
1815
2.28k
     return ret;
1816
2.28k
    };
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#1}::operator()() const
Line
Count
Source
1811
180
    auto final_read = [sa, synthesisWindow, simpleBuf]() {
1812
     // final read into tile buffer
1813
180
     bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1814
1815
180
     return ret;
1816
180
    };
1817
2.46k
    if(numThreads > 1)
1818
2.46k
     imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#2}::operator()() const
Line
Count
Source
1818
2.28k
     imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#2}::operator()() const
Line
Count
Source
1818
180
     imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1819
3
    else
1820
3
     final_read();
1821
1822
2.46k
    return true;
1823
2.46k
   }
1824
6.57k
   auto final_read = [this, sa, synthesisWindow, simpleBuf]() {
1825
    // final read into tile buffer
1826
6.56k
    bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1827
1828
6.56k
    return ret;
1829
6.56k
   };
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#3}::operator()() const
Line
Count
Source
1824
271
   auto final_read = [this, sa, synthesisWindow, simpleBuf]() {
1825
    // final read into tile buffer
1826
271
    bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1827
1828
271
    return ret;
1829
271
   };
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#3}::operator()() const
Line
Count
Source
1824
6.29k
   auto final_read = [this, sa, synthesisWindow, simpleBuf]() {
1825
    // final read into tile buffer
1826
6.29k
    bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1827
1828
6.29k
    return ret;
1829
6.29k
   };
1830
   // pre-allocate all blocks
1831
6.57k
   std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo;
1832
47.0k
   for(uint8_t resno = 1; resno < numres_; resno++)
1833
40.4k
   {
1834
40.4k
    PartialBandInfo<FILTER_WIDTH> bandInfo;
1835
40.4k
    if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf))
1836
0
     return false;
1837
40.4k
    resBandInfo.push_back(bandInfo);
1838
40.4k
   }
1839
6.57k
   D decompressor;
1840
47.0k
   for(uint8_t resno = 1; resno < numres_; resno++)
1841
40.4k
   {
1842
40.4k
    dwt_data<T> horiz;
1843
40.4k
    dwt_data<T> vert;
1844
40.4k
    horiz.sn_full = fullRes->width();
1845
40.4k
    vert.sn_full = fullRes->height();
1846
40.4k
    fullRes++;
1847
40.4k
    horiz.dn_full = fullRes->width() - horiz.sn_full;
1848
40.4k
    horiz.parity = fullRes->x0 & 1;
1849
40.4k
    vert.dn_full = fullRes->height() - vert.sn_full;
1850
40.4k
    vert.parity = fullRes->y0 & 1;
1851
40.4k
    PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1];
1852
1853
457k
    auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1854
948k
     for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_;
1855
491k
       yPos += HORIZ_PASS_HEIGHT)
1856
513k
     {
1857
513k
      auto height =
1858
513k
        std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos);
1859
513k
      taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity;
1860
513k
      taskInfo->data.memH =
1861
513k
        taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) +
1862
513k
        2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0);
1863
513k
      if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height))
1864
19.7k
      {
1865
19.7k
         return false;
1866
19.7k
      }
1867
493k
      taskInfo->data.memL = taskInfo->data.mem;
1868
493k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1869
493k
                            (int64_t)taskInfo->data.win_l.x0);
1870
493k
      decompressor.decompress_h(&taskInfo->data);
1871
493k
      if(!sa->write(resno,
1872
493k
              grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1,
1873
493k
                   yPos + height),
1874
493k
              (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 -
1875
493k
                   2 * (int64_t)taskInfo->data.win_l.x0),
1876
493k
              HORIZ_PASS_HEIGHT, 1))
1877
2.46k
      {
1878
2.46k
         return false;
1879
2.46k
      }
1880
493k
     }
1881
1882
435k
     return true;
1883
457k
    };
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda(grk::TaskInfo<int, grk::dwt_data<int> >*)#1}::operator()(grk::TaskInfo<int, grk::dwt_data<int> >*) const
Line
Count
Source
1853
37.1k
    auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1854
121k
     for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_;
1855
84.3k
       yPos += HORIZ_PASS_HEIGHT)
1856
84.4k
     {
1857
84.4k
      auto height =
1858
84.4k
        std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos);
1859
84.4k
      taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity;
1860
84.4k
      taskInfo->data.memH =
1861
84.4k
        taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) +
1862
84.4k
        2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0);
1863
84.4k
      if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height))
1864
0
      {
1865
0
         return false;
1866
0
      }
1867
84.4k
      taskInfo->data.memL = taskInfo->data.mem;
1868
84.4k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1869
84.4k
                            (int64_t)taskInfo->data.win_l.x0);
1870
84.4k
      decompressor.decompress_h(&taskInfo->data);
1871
84.4k
      if(!sa->write(resno,
1872
84.4k
              grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1,
1873
84.4k
                   yPos + height),
1874
84.4k
              (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 -
1875
84.4k
                   2 * (int64_t)taskInfo->data.win_l.x0),
1876
84.4k
              HORIZ_PASS_HEIGHT, 1))
1877
54
      {
1878
54
         return false;
1879
54
      }
1880
84.4k
     }
1881
1882
37.1k
     return true;
1883
37.1k
    };
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*)#1}::operator()(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*) const
Line
Count
Source
1853
420k
    auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1854
826k
     for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_;
1855
420k
       yPos += HORIZ_PASS_HEIGHT)
1856
428k
     {
1857
428k
      auto height =
1858
428k
        std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos);
1859
428k
      taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity;
1860
428k
      taskInfo->data.memH =
1861
428k
        taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) +
1862
428k
        2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0);
1863
428k
      if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height))
1864
19.7k
      {
1865
19.7k
         return false;
1866
19.7k
      }
1867
409k
      taskInfo->data.memL = taskInfo->data.mem;
1868
409k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1869
409k
                            (int64_t)taskInfo->data.win_l.x0);
1870
409k
      decompressor.decompress_h(&taskInfo->data);
1871
409k
      if(!sa->write(resno,
1872
409k
              grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1,
1873
409k
                   yPos + height),
1874
409k
              (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 -
1875
409k
                   2 * (int64_t)taskInfo->data.win_l.x0),
1876
409k
              HORIZ_PASS_HEIGHT, 1))
1877
2.41k
      {
1878
2.41k
         return false;
1879
2.41k
      }
1880
409k
     }
1881
1882
398k
     return true;
1883
420k
    };
1884
357k
    auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1885
816k
     for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_;
1886
459k
       xPos += VERT_PASS_WIDTH)
1887
487k
     {
1888
487k
      auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos));
1889
487k
      taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH;
1890
487k
      taskInfo->data.memH =
1891
487k
        taskInfo->data.mem +
1892
487k
        ((!taskInfo->data.parity) +
1893
487k
         2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) *
1894
487k
          VERT_PASS_WIDTH;
1895
487k
      if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width))
1896
28.8k
      {
1897
28.8k
         return false;
1898
28.8k
      }
1899
459k
      taskInfo->data.memL = taskInfo->data.mem;
1900
459k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1901
459k
                            (int64_t)taskInfo->data.win_l.x0) *
1902
459k
                               VERT_PASS_WIDTH;
1903
459k
      decompressor.decompress_v(&taskInfo->data);
1904
      // write to buffer for final res
1905
459k
      if(!sa->write(resno,
1906
459k
              grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width,
1907
459k
                   bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() +
1908
459k
                     taskInfo->data.win_h.length()),
1909
459k
              (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 -
1910
459k
                               2 * (int64_t)taskInfo->data.win_l.x0) *
1911
459k
                                VERT_PASS_WIDTH),
1912
459k
              1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t))))
1913
0
      {
1914
0
         Logger::logger_.error("Sparse array write failure");
1915
0
         return false;
1916
0
      }
1917
459k
     }
1918
1919
329k
     return true;
1920
357k
    };
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda(grk::TaskInfo<int, grk::dwt_data<int> >*)#2}::operator()(grk::TaskInfo<int, grk::dwt_data<int> >*) const
Line
Count
Source
1884
20.3k
    auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1885
50.3k
     for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_;
1886
30.0k
       xPos += VERT_PASS_WIDTH)
1887
30.0k
     {
1888
30.0k
      auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos));
1889
30.0k
      taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH;
1890
30.0k
      taskInfo->data.memH =
1891
30.0k
        taskInfo->data.mem +
1892
30.0k
        ((!taskInfo->data.parity) +
1893
30.0k
         2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) *
1894
30.0k
          VERT_PASS_WIDTH;
1895
30.0k
      if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width))
1896
3
      {
1897
3
         return false;
1898
3
      }
1899
30.0k
      taskInfo->data.memL = taskInfo->data.mem;
1900
30.0k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1901
30.0k
                            (int64_t)taskInfo->data.win_l.x0) *
1902
30.0k
                               VERT_PASS_WIDTH;
1903
30.0k
      decompressor.decompress_v(&taskInfo->data);
1904
      // write to buffer for final res
1905
30.0k
      if(!sa->write(resno,
1906
30.0k
              grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width,
1907
30.0k
                   bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() +
1908
30.0k
                     taskInfo->data.win_h.length()),
1909
30.0k
              (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 -
1910
30.0k
                               2 * (int64_t)taskInfo->data.win_l.x0) *
1911
30.0k
                                VERT_PASS_WIDTH),
1912
30.0k
              1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t))))
1913
0
      {
1914
0
         Logger::logger_.error("Sparse array write failure");
1915
0
         return false;
1916
0
      }
1917
30.0k
     }
1918
1919
20.3k
     return true;
1920
20.3k
    };
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*)#2}::operator()(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*) const
Line
Count
Source
1884
337k
    auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1885
766k
     for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_;
1886
429k
       xPos += VERT_PASS_WIDTH)
1887
457k
     {
1888
457k
      auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos));
1889
457k
      taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH;
1890
457k
      taskInfo->data.memH =
1891
457k
        taskInfo->data.mem +
1892
457k
        ((!taskInfo->data.parity) +
1893
457k
         2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) *
1894
457k
          VERT_PASS_WIDTH;
1895
457k
      if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width))
1896
28.8k
      {
1897
28.8k
         return false;
1898
28.8k
      }
1899
429k
      taskInfo->data.memL = taskInfo->data.mem;
1900
429k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1901
429k
                            (int64_t)taskInfo->data.win_l.x0) *
1902
429k
                               VERT_PASS_WIDTH;
1903
429k
      decompressor.decompress_v(&taskInfo->data);
1904
      // write to buffer for final res
1905
429k
      if(!sa->write(resno,
1906
429k
              grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width,
1907
429k
                   bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() +
1908
429k
                     taskInfo->data.win_h.length()),
1909
429k
              (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 -
1910
429k
                               2 * (int64_t)taskInfo->data.win_l.x0) *
1911
429k
                                VERT_PASS_WIDTH),
1912
429k
              1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t))))
1913
0
      {
1914
0
         Logger::logger_.error("Sparse array write failure");
1915
0
         return false;
1916
0
      }
1917
429k
     }
1918
1919
308k
     return true;
1920
337k
    };
1921
1922
    // 3. calculate synthesis
1923
40.4k
    horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX();
1924
40.4k
    horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX();
1925
40.4k
    horiz.resno = resno;
1926
40.4k
    size_t dataLength =
1927
40.4k
      (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT;
1928
40.4k
    auto resFlow = imageComponentFlow->getResFlow(resno - 1);
1929
120k
    for(uint32_t k = 0; k < 2 && dataLength; ++k)
1930
80.4k
    {
1931
80.4k
     uint32_t numTasks = numThreads;
1932
80.4k
     uint32_t num_rows = bandInfo.splitWindowREL_[k].height();
1933
80.4k
     if(num_rows < numTasks)
1934
77.4k
      numTasks = num_rows;
1935
80.4k
     uint32_t incrPerJob = numTasks ? (num_rows / numTasks) : 0;
1936
80.4k
     if(numThreads == 1)
1937
0
      numTasks = 1;
1938
80.4k
     if(incrPerJob == 0)
1939
13.0k
      continue;
1940
547k
     for(uint32_t j = 0; j < numTasks; ++j)
1941
480k
     {
1942
480k
      uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob;
1943
480k
      uint32_t indexMax = j < (numTasks - 1U)
1944
480k
                  ? bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob
1945
480k
                  : bandInfo.splitWindowREL_[k].y1;
1946
480k
      if(indexMin == indexMax)
1947
0
         continue;
1948
480k
      auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax);
1949
480k
      if(!taskInfo->data.alloc(dataLength, pad))
1950
0
      {
1951
0
         delete taskInfo;
1952
0
         return false;
1953
0
      }
1954
480k
      tasks.push_back(taskInfo);
1955
480k
      if(numThreads > 1)
1956
492k
         resFlow->waveletHoriz_->nextTask().work(
1957
492k
           [taskInfo, executor_h] { executor_h(taskInfo); });
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#4}::operator()() const
Line
Count
Source
1957
37.1k
           [taskInfo, executor_h] { executor_h(taskInfo); });
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#4}::operator()() const
Line
Count
Source
1957
421k
           [taskInfo, executor_h] { executor_h(taskInfo); });
1958
18.4E
      else
1959
18.4E
         executor_h(taskInfo);
1960
480k
     }
1961
67.4k
    }
1962
40.4k
    dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH *
1963
40.4k
           sizeof(T) / sizeof(int32_t);
1964
40.4k
    vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY();
1965
40.4k
    vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY();
1966
40.4k
    vert.resno = resno;
1967
40.4k
    uint32_t numTasks = numThreads;
1968
40.4k
    uint32_t numColumns = bandInfo.resWindowREL_.width();
1969
40.4k
    if(numColumns < numTasks)
1970
34.4k
     numTasks = numColumns;
1971
40.4k
    uint32_t incrPerJob = numTasks ? (numColumns / numTasks) : 0;
1972
40.4k
    if(numThreads == 1)
1973
0
     numTasks = 1;
1974
412k
    for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j)
1975
372k
    {
1976
372k
     uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob;
1977
372k
     uint32_t indexMax = j < (numTasks - 1U) ? bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob
1978
372k
                         : bandInfo.resWindowREL_.x1;
1979
372k
     if(indexMin == indexMax)
1980
0
      continue;
1981
372k
     auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax);
1982
372k
     if(!taskInfo->data.alloc(dataLength, pad))
1983
0
     {
1984
0
      delete taskInfo;
1985
0
      return false;
1986
0
     }
1987
372k
     tasks.push_back(taskInfo);
1988
372k
     if(numThreads > 1)
1989
381k
      resFlow->waveletVert_->nextTask().work(
1990
381k
        [taskInfo, executor_v] { executor_v(taskInfo); });
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#5}::operator()() const
Line
Count
Source
1990
20.3k
        [taskInfo, executor_v] { executor_v(taskInfo); });
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#5}::operator()() const
Line
Count
Source
1990
338k
        [taskInfo, executor_v] { executor_v(taskInfo); });
1991
18.4E
     else
1992
18.4E
      executor_v(taskInfo);
1993
372k
    }
1994
40.4k
   }
1995
1996
6.57k
   if(numThreads > 1)
1997
6.58k
    imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#6}::operator()() const
Line
Count
Source
1997
271
    imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#6}::operator()() const
Line
Count
Source
1997
6.29k
    imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1998
18.4E
   else
1999
18.4E
    final_read();
2000
2001
6.57k
   return true;
2002
6.57k
}
bool grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)
Line
Count
Source
1783
6.85k
{
1784
6.85k
   uint8_t numresolutions = tilec_->numresolutions;
1785
6.85k
   auto buf = tilec_->getWindow();
1786
6.85k
   auto simpleBuf = buf->getResWindowBufferHighestSimple();
1787
6.85k
   auto fullRes = tilec_->resolutions_;
1788
6.85k
   auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1;
1789
6.85k
   if(!fullResTopLevel->width() || !fullResTopLevel->height())
1790
2.20k
    return true;
1791
1792
4.64k
   [[maybe_unused]] const uint16_t debug_compno = 0;
1793
4.64k
   const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t);
1794
4.64k
   const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) *
1795
4.64k
            sizeof(T) / sizeof(int32_t);
1796
   // reduce window
1797
4.64k
   auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_);
1798
4.64k
   assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow);
1799
   // shift to relative coordinates
1800
4.64k
   synthesisWindow =
1801
4.64k
     synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0);
1802
4.64k
   if(synthesisWindow.empty())
1803
30
    return true;
1804
4.61k
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
1805
4.61k
   auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
1806
   // imageComponentFlow == nullptr ==> no blocks were decompressed for this component
1807
4.61k
   if(!imageComponentFlow)
1808
2.07k
    return true;
1809
2.54k
   if(numres_ == 1U)
1810
2.28k
   {
1811
2.28k
    auto final_read = [sa, synthesisWindow, simpleBuf]() {
1812
     // final read into tile buffer
1813
2.28k
     bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1814
1815
2.28k
     return ret;
1816
2.28k
    };
1817
2.28k
    if(numThreads > 1)
1818
2.28k
     imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1819
3
    else
1820
3
     final_read();
1821
1822
2.28k
    return true;
1823
2.28k
   }
1824
261
   auto final_read = [this, sa, synthesisWindow, simpleBuf]() {
1825
    // final read into tile buffer
1826
261
    bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1827
1828
261
    return ret;
1829
261
   };
1830
   // pre-allocate all blocks
1831
261
   std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo;
1832
1.11k
   for(uint8_t resno = 1; resno < numres_; resno++)
1833
853
   {
1834
853
    PartialBandInfo<FILTER_WIDTH> bandInfo;
1835
853
    if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf))
1836
0
     return false;
1837
853
    resBandInfo.push_back(bandInfo);
1838
853
   }
1839
261
   D decompressor;
1840
1.11k
   for(uint8_t resno = 1; resno < numres_; resno++)
1841
853
   {
1842
853
    dwt_data<T> horiz;
1843
853
    dwt_data<T> vert;
1844
853
    horiz.sn_full = fullRes->width();
1845
853
    vert.sn_full = fullRes->height();
1846
853
    fullRes++;
1847
853
    horiz.dn_full = fullRes->width() - horiz.sn_full;
1848
853
    horiz.parity = fullRes->x0 & 1;
1849
853
    vert.dn_full = fullRes->height() - vert.sn_full;
1850
853
    vert.parity = fullRes->y0 & 1;
1851
853
    PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1];
1852
1853
853
    auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1854
853
     for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_;
1855
853
       yPos += HORIZ_PASS_HEIGHT)
1856
853
     {
1857
853
      auto height =
1858
853
        std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos);
1859
853
      taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity;
1860
853
      taskInfo->data.memH =
1861
853
        taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) +
1862
853
        2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0);
1863
853
      if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height))
1864
853
      {
1865
853
         return false;
1866
853
      }
1867
853
      taskInfo->data.memL = taskInfo->data.mem;
1868
853
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1869
853
                            (int64_t)taskInfo->data.win_l.x0);
1870
853
      decompressor.decompress_h(&taskInfo->data);
1871
853
      if(!sa->write(resno,
1872
853
              grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1,
1873
853
                   yPos + height),
1874
853
              (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 -
1875
853
                   2 * (int64_t)taskInfo->data.win_l.x0),
1876
853
              HORIZ_PASS_HEIGHT, 1))
1877
853
      {
1878
853
         return false;
1879
853
      }
1880
853
     }
1881
1882
853
     return true;
1883
853
    };
1884
853
    auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1885
853
     for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_;
1886
853
       xPos += VERT_PASS_WIDTH)
1887
853
     {
1888
853
      auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos));
1889
853
      taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH;
1890
853
      taskInfo->data.memH =
1891
853
        taskInfo->data.mem +
1892
853
        ((!taskInfo->data.parity) +
1893
853
         2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) *
1894
853
          VERT_PASS_WIDTH;
1895
853
      if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width))
1896
853
      {
1897
853
         return false;
1898
853
      }
1899
853
      taskInfo->data.memL = taskInfo->data.mem;
1900
853
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1901
853
                            (int64_t)taskInfo->data.win_l.x0) *
1902
853
                               VERT_PASS_WIDTH;
1903
853
      decompressor.decompress_v(&taskInfo->data);
1904
      // write to buffer for final res
1905
853
      if(!sa->write(resno,
1906
853
              grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width,
1907
853
                   bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() +
1908
853
                     taskInfo->data.win_h.length()),
1909
853
              (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 -
1910
853
                               2 * (int64_t)taskInfo->data.win_l.x0) *
1911
853
                                VERT_PASS_WIDTH),
1912
853
              1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t))))
1913
853
      {
1914
853
         Logger::logger_.error("Sparse array write failure");
1915
853
         return false;
1916
853
      }
1917
853
     }
1918
1919
853
     return true;
1920
853
    };
1921
1922
    // 3. calculate synthesis
1923
853
    horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX();
1924
853
    horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX();
1925
853
    horiz.resno = resno;
1926
853
    size_t dataLength =
1927
853
      (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT;
1928
853
    auto resFlow = imageComponentFlow->getResFlow(resno - 1);
1929
2.55k
    for(uint32_t k = 0; k < 2 && dataLength; ++k)
1930
1.70k
    {
1931
1.70k
     uint32_t numTasks = numThreads;
1932
1.70k
     uint32_t num_rows = bandInfo.splitWindowREL_[k].height();
1933
1.70k
     if(num_rows < numTasks)
1934
768
      numTasks = num_rows;
1935
1.70k
     uint32_t incrPerJob = numTasks ? (num_rows / numTasks) : 0;
1936
1.70k
     if(numThreads == 1)
1937
0
      numTasks = 1;
1938
1.70k
     if(incrPerJob == 0)
1939
66
      continue;
1940
39.0k
     for(uint32_t j = 0; j < numTasks; ++j)
1941
37.3k
     {
1942
37.3k
      uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob;
1943
37.3k
      uint32_t indexMax = j < (numTasks - 1U)
1944
37.3k
                  ? bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob
1945
37.3k
                  : bandInfo.splitWindowREL_[k].y1;
1946
37.3k
      if(indexMin == indexMax)
1947
0
         continue;
1948
37.3k
      auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax);
1949
37.3k
      if(!taskInfo->data.alloc(dataLength, pad))
1950
0
      {
1951
0
         delete taskInfo;
1952
0
         return false;
1953
0
      }
1954
37.3k
      tasks.push_back(taskInfo);
1955
37.3k
      if(numThreads > 1)
1956
37.8k
         resFlow->waveletHoriz_->nextTask().work(
1957
37.8k
           [taskInfo, executor_h] { executor_h(taskInfo); });
1958
18.4E
      else
1959
18.4E
         executor_h(taskInfo);
1960
37.3k
     }
1961
1.64k
    }
1962
853
    dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH *
1963
853
           sizeof(T) / sizeof(int32_t);
1964
853
    vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY();
1965
853
    vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY();
1966
853
    vert.resno = resno;
1967
853
    uint32_t numTasks = numThreads;
1968
853
    uint32_t numColumns = bandInfo.resWindowREL_.width();
1969
853
    if(numColumns < numTasks)
1970
237
     numTasks = numColumns;
1971
853
    uint32_t incrPerJob = numTasks ? (numColumns / numTasks) : 0;
1972
853
    if(numThreads == 1)
1973
0
     numTasks = 1;
1974
21.3k
    for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j)
1975
20.4k
    {
1976
20.4k
     uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob;
1977
20.4k
     uint32_t indexMax = j < (numTasks - 1U) ? bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob
1978
20.4k
                         : bandInfo.resWindowREL_.x1;
1979
20.4k
     if(indexMin == indexMax)
1980
0
      continue;
1981
20.4k
     auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax);
1982
20.4k
     if(!taskInfo->data.alloc(dataLength, pad))
1983
0
     {
1984
0
      delete taskInfo;
1985
0
      return false;
1986
0
     }
1987
20.4k
     tasks.push_back(taskInfo);
1988
20.4k
     if(numThreads > 1)
1989
20.6k
      resFlow->waveletVert_->nextTask().work(
1990
20.6k
        [taskInfo, executor_v] { executor_v(taskInfo); });
1991
18.4E
     else
1992
18.4E
      executor_v(taskInfo);
1993
20.4k
    }
1994
853
   }
1995
1996
261
   if(numThreads > 1)
1997
271
    imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1998
18.4E
   else
1999
18.4E
    final_read();
2000
2001
261
   return true;
2002
261
}
bool grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)
Line
Count
Source
1783
11.4k
{
1784
11.4k
   uint8_t numresolutions = tilec_->numresolutions;
1785
11.4k
   auto buf = tilec_->getWindow();
1786
11.4k
   auto simpleBuf = buf->getResWindowBufferHighestSimple();
1787
11.4k
   auto fullRes = tilec_->resolutions_;
1788
11.4k
   auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1;
1789
11.4k
   if(!fullResTopLevel->width() || !fullResTopLevel->height())
1790
4.72k
    return true;
1791
1792
6.73k
   [[maybe_unused]] const uint16_t debug_compno = 0;
1793
6.73k
   const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t);
1794
6.73k
   const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) *
1795
6.73k
            sizeof(T) / sizeof(int32_t);
1796
   // reduce window
1797
6.73k
   auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_);
1798
6.73k
   assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow);
1799
   // shift to relative coordinates
1800
6.73k
   synthesisWindow =
1801
6.73k
     synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0);
1802
6.73k
   if(synthesisWindow.empty())
1803
3
    return true;
1804
6.72k
   uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers();
1805
6.72k
   auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_);
1806
   // imageComponentFlow == nullptr ==> no blocks were decompressed for this component
1807
6.72k
   if(!imageComponentFlow)
1808
232
    return true;
1809
6.49k
   if(numres_ == 1U)
1810
182
   {
1811
182
    auto final_read = [sa, synthesisWindow, simpleBuf]() {
1812
     // final read into tile buffer
1813
182
     bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1814
1815
182
     return ret;
1816
182
    };
1817
182
    if(numThreads > 1)
1818
182
     imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1819
0
    else
1820
0
     final_read();
1821
1822
182
    return true;
1823
182
   }
1824
6.31k
   auto final_read = [this, sa, synthesisWindow, simpleBuf]() {
1825
    // final read into tile buffer
1826
6.31k
    bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_);
1827
1828
6.31k
    return ret;
1829
6.31k
   };
1830
   // pre-allocate all blocks
1831
6.31k
   std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo;
1832
45.9k
   for(uint8_t resno = 1; resno < numres_; resno++)
1833
39.6k
   {
1834
39.6k
    PartialBandInfo<FILTER_WIDTH> bandInfo;
1835
39.6k
    if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf))
1836
0
     return false;
1837
39.6k
    resBandInfo.push_back(bandInfo);
1838
39.6k
   }
1839
6.31k
   D decompressor;
1840
45.8k
   for(uint8_t resno = 1; resno < numres_; resno++)
1841
39.5k
   {
1842
39.5k
    dwt_data<T> horiz;
1843
39.5k
    dwt_data<T> vert;
1844
39.5k
    horiz.sn_full = fullRes->width();
1845
39.5k
    vert.sn_full = fullRes->height();
1846
39.5k
    fullRes++;
1847
39.5k
    horiz.dn_full = fullRes->width() - horiz.sn_full;
1848
39.5k
    horiz.parity = fullRes->x0 & 1;
1849
39.5k
    vert.dn_full = fullRes->height() - vert.sn_full;
1850
39.5k
    vert.parity = fullRes->y0 & 1;
1851
39.5k
    PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1];
1852
1853
39.5k
    auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1854
39.5k
     for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_;
1855
39.5k
       yPos += HORIZ_PASS_HEIGHT)
1856
39.5k
     {
1857
39.5k
      auto height =
1858
39.5k
        std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos);
1859
39.5k
      taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity;
1860
39.5k
      taskInfo->data.memH =
1861
39.5k
        taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) +
1862
39.5k
        2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0);
1863
39.5k
      if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height))
1864
39.5k
      {
1865
39.5k
         return false;
1866
39.5k
      }
1867
39.5k
      taskInfo->data.memL = taskInfo->data.mem;
1868
39.5k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1869
39.5k
                            (int64_t)taskInfo->data.win_l.x0);
1870
39.5k
      decompressor.decompress_h(&taskInfo->data);
1871
39.5k
      if(!sa->write(resno,
1872
39.5k
              grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1,
1873
39.5k
                   yPos + height),
1874
39.5k
              (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 -
1875
39.5k
                   2 * (int64_t)taskInfo->data.win_l.x0),
1876
39.5k
              HORIZ_PASS_HEIGHT, 1))
1877
39.5k
      {
1878
39.5k
         return false;
1879
39.5k
      }
1880
39.5k
     }
1881
1882
39.5k
     return true;
1883
39.5k
    };
1884
39.5k
    auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) {
1885
39.5k
     for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_;
1886
39.5k
       xPos += VERT_PASS_WIDTH)
1887
39.5k
     {
1888
39.5k
      auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos));
1889
39.5k
      taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH;
1890
39.5k
      taskInfo->data.memH =
1891
39.5k
        taskInfo->data.mem +
1892
39.5k
        ((!taskInfo->data.parity) +
1893
39.5k
         2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) *
1894
39.5k
          VERT_PASS_WIDTH;
1895
39.5k
      if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width))
1896
39.5k
      {
1897
39.5k
         return false;
1898
39.5k
      }
1899
39.5k
      taskInfo->data.memL = taskInfo->data.mem;
1900
39.5k
      taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 -
1901
39.5k
                            (int64_t)taskInfo->data.win_l.x0) *
1902
39.5k
                               VERT_PASS_WIDTH;
1903
39.5k
      decompressor.decompress_v(&taskInfo->data);
1904
      // write to buffer for final res
1905
39.5k
      if(!sa->write(resno,
1906
39.5k
              grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width,
1907
39.5k
                   bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() +
1908
39.5k
                     taskInfo->data.win_h.length()),
1909
39.5k
              (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 -
1910
39.5k
                               2 * (int64_t)taskInfo->data.win_l.x0) *
1911
39.5k
                                VERT_PASS_WIDTH),
1912
39.5k
              1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t))))
1913
39.5k
      {
1914
39.5k
         Logger::logger_.error("Sparse array write failure");
1915
39.5k
         return false;
1916
39.5k
      }
1917
39.5k
     }
1918
1919
39.5k
     return true;
1920
39.5k
    };
1921
1922
    // 3. calculate synthesis
1923
39.5k
    horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX();
1924
39.5k
    horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX();
1925
39.5k
    horiz.resno = resno;
1926
39.5k
    size_t dataLength =
1927
39.5k
      (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT;
1928
39.5k
    auto resFlow = imageComponentFlow->getResFlow(resno - 1);
1929
118k
    for(uint32_t k = 0; k < 2 && dataLength; ++k)
1930
78.7k
    {
1931
78.7k
     uint32_t numTasks = numThreads;
1932
78.7k
     uint32_t num_rows = bandInfo.splitWindowREL_[k].height();
1933
78.7k
     if(num_rows < numTasks)
1934
76.7k
      numTasks = num_rows;
1935
78.7k
     uint32_t incrPerJob = numTasks ? (num_rows / numTasks) : 0;
1936
78.7k
     if(numThreads == 1)
1937
0
      numTasks = 1;
1938
78.7k
     if(incrPerJob == 0)
1939
12.9k
      continue;
1940
508k
     for(uint32_t j = 0; j < numTasks; ++j)
1941
442k
     {
1942
442k
      uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob;
1943
442k
      uint32_t indexMax = j < (numTasks - 1U)
1944
442k
                  ? bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob
1945
442k
                  : bandInfo.splitWindowREL_[k].y1;
1946
442k
      if(indexMin == indexMax)
1947
0
         continue;
1948
442k
      auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax);
1949
442k
      if(!taskInfo->data.alloc(dataLength, pad))
1950
0
      {
1951
0
         delete taskInfo;
1952
0
         return false;
1953
0
      }
1954
442k
      tasks.push_back(taskInfo);
1955
442k
      if(numThreads > 1)
1956
455k
         resFlow->waveletHoriz_->nextTask().work(
1957
455k
           [taskInfo, executor_h] { executor_h(taskInfo); });
1958
18.4E
      else
1959
18.4E
         executor_h(taskInfo);
1960
442k
     }
1961
65.7k
    }
1962
39.5k
    dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH *
1963
39.5k
           sizeof(T) / sizeof(int32_t);
1964
39.5k
    vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY();
1965
39.5k
    vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY();
1966
39.5k
    vert.resno = resno;
1967
39.5k
    uint32_t numTasks = numThreads;
1968
39.5k
    uint32_t numColumns = bandInfo.resWindowREL_.width();
1969
39.5k
    if(numColumns < numTasks)
1970
34.2k
     numTasks = numColumns;
1971
39.5k
    uint32_t incrPerJob = numTasks ? (numColumns / numTasks) : 0;
1972
39.5k
    if(numThreads == 1)
1973
0
     numTasks = 1;
1974
391k
    for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j)
1975
351k
    {
1976
351k
     uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob;
1977
351k
     uint32_t indexMax = j < (numTasks - 1U) ? bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob
1978
351k
                         : bandInfo.resWindowREL_.x1;
1979
351k
     if(indexMin == indexMax)
1980
0
      continue;
1981
351k
     auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax);
1982
351k
     if(!taskInfo->data.alloc(dataLength, pad))
1983
0
     {
1984
0
      delete taskInfo;
1985
0
      return false;
1986
0
     }
1987
351k
     tasks.push_back(taskInfo);
1988
351k
     if(numThreads > 1)
1989
360k
      resFlow->waveletVert_->nextTask().work(
1990
360k
        [taskInfo, executor_v] { executor_v(taskInfo); });
1991
18.4E
     else
1992
18.4E
      executor_v(taskInfo);
1993
351k
    }
1994
39.5k
   }
1995
1996
6.31k
   if(numThreads > 1)
1997
6.31k
    imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); });
1998
3
   else
1999
3
    final_read();
2000
2001
6.31k
   return true;
2002
6.31k
}
2003
/**
 * Construct a reverse wavelet transform object.
 *
 * Each argument is stored in the member of the same name; scheduler_ is
 * obtained from tileProcessor->getScheduler(). The constructor body is empty.
 *
 * @param tileProcessor   tile processor; dereferenced here, so must not be null
 * @param tilec           tile component (decompress() later reads its region window)
 * @param compno          component number
 * @param unreducedWindow stored as unreducedWindow_
 * @param numres          stored as numres_
 * @param qmfbid          filter selector: decompress() takes the "53" routines
 *                        when this is 1 and the "97" routines otherwise
 */
WaveletReverse::WaveletReverse(TileProcessor* tileProcessor, TileComponent* tilec, uint16_t compno,
2004
                 grk_rect32 unreducedWindow, uint8_t numres, uint8_t qmfbid)
2005
  : tileProcessor_(tileProcessor), scheduler_(tileProcessor->getScheduler()), tilec_(tilec),
2006
    compno_(compno), unreducedWindow_(unreducedWindow), numres_(numres), qmfbid_(qmfbid)
2007
18.3k
{}
2008
/**
 * Destructor: releases every task record still referenced by this object.
 *
 * tasks_ and tasksF_ hold raw pointers (decompress() hands these lists to
 * decompress_partial_tile); each element of both lists is deleted here,
 * tasks_ first and tasksF_ second.
 */
WaveletReverse::~WaveletReverse(void)
{
   // one generic helper serves both lists regardless of their element type
   auto releaseAll = [](auto& taskList) {
      for(auto* task : taskList)
         delete task;
   };

   releaseAll(tasks_);
   releaseAll(tasksF_);
}
2015
bool WaveletReverse::decompress(void)
2016
18.3k
{
2017
18.3k
   if(qmfbid_ == 1)
2018
6.85k
   {
2019
6.85k
    if(tileProcessor_->cp_->wholeTileDecompress_)
2020
0
     return decompress_tile_53();
2021
6.85k
    else
2022
6.85k
    {
2023
6.85k
     constexpr uint32_t VERT_PASS_WIDTH = 4;
2024
6.85k
     return decompress_partial_tile<
2025
6.85k
       int32_t, getFilterPad<uint32_t>(true), VERT_PASS_WIDTH,
2026
6.85k
       Partial53<int32_t, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH>>(
2027
6.85k
       tilec_->getRegionWindow(), tasks_);
2028
6.85k
    }
2029
6.85k
   }
2030
11.4k
   else
2031
11.4k
   {
2032
11.4k
    if(tileProcessor_->cp_->wholeTileDecompress_)
2033
0
     return decompress_tile_97();
2034
11.4k
    else
2035
11.4k
    {
2036
11.4k
     constexpr uint32_t VERT_PASS_WIDTH = 1;
2037
11.4k
     return decompress_partial_tile<
2038
11.4k
       vec4f, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH,
2039
11.4k
       Partial97<vec4f, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH>>(
2040
11.4k
       tilec_->getRegionWindow(), tasksF_);
2041
11.4k
    }
2042
11.4k
   }
2043
18.3k
}
2044
2045
} // namespace grk
2046
#endif