/src/grok/src/lib/core/wavelet/WaveletReverse.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (C) 2016-2024 Grok Image Compression Inc. |
3 | | * |
4 | | * This source code is free software: you can redistribute it and/or modify |
5 | | * it under the terms of the GNU Affero General Public License, version 3, |
6 | | * as published by the Free Software Foundation. |
7 | | * |
8 | | * This source code is distributed in the hope that it will be useful, |
9 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | | * GNU Affero General Public License for more details. |
12 | | * |
13 | | * You should have received a copy of the GNU Affero General Public License |
14 | | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
15 | | * |
16 | | * |
17 | | * This source code incorporates work covered by the BSD 2-clause license. |
18 | | * Please see the LICENSE file in the root directory for details. |
19 | | * |
20 | | */ |
21 | | |
22 | | #include "grk_includes.h" |
23 | | #include <algorithm> |
24 | | #include <limits> |
25 | | #include <sstream> |
26 | | |
27 | | #undef HWY_TARGET_INCLUDE |
28 | | #define HWY_TARGET_INCLUDE "wavelet/WaveletReverse.cpp" |
29 | | #include <hwy/foreach_target.h> |
30 | | #include <hwy/highway.h> |
31 | | HWY_BEFORE_NAMESPACE(); |
32 | | namespace grk |
33 | | { |
34 | | namespace HWY_NAMESPACE |
35 | | { |
36 | | using namespace hwy::HWY_NAMESPACE; |
37 | | |
38 | | static size_t hwy_num_lanes(void) |
39 | 0 | { |
40 | 0 | const HWY_FULL(int32_t) di; |
41 | 0 | return Lanes(di); |
42 | 0 | } Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_num_lanes() Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_num_lanes() |
43 | | |
44 | 0 | #define HWY_PLL_COLS_53 (2 * Lanes(di)) |
45 | | |
46 | | static void hwy_decompress_v_final_memcpy_53(const int32_t* buf, const uint32_t height, |
47 | | int32_t* dest, const size_t strideDest) |
48 | 0 | { |
49 | 0 | const HWY_FULL(int32_t) di; |
50 | 0 | for(uint32_t i = 0; i < height; ++i) |
51 | 0 | { |
52 | 0 | StoreU(Load(di, buf + HWY_PLL_COLS_53 * i + 0), di, &dest[(size_t)i * strideDest + 0]); |
53 | 0 | StoreU(Load(di, buf + HWY_PLL_COLS_53 * i + Lanes(di)), di, |
54 | 0 | dest + (size_t)i * strideDest + Lanes(di)); |
55 | 0 | } |
56 | 0 | } Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_final_memcpy_53(int const*, unsigned int, int*, unsigned long) |
57 | | /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or |
58 | | * 16 in AVX2, when top-most pixel is on even coordinate */ |
59 | | static void hwy_decompress_v_parity_even_mcols_53(int32_t* buf, int32_t* bandL, /* even */ |
60 | | const uint32_t hL, const size_t strideL, |
61 | | int32_t* bandH, /* odd */ |
62 | | const uint32_t hH, const size_t strideH, |
63 | | int32_t* dest, const uint32_t strideDest) |
64 | 0 | { |
65 | 0 | const HWY_FULL(int32_t) di; |
66 | 0 | auto two = Set(di, 2); |
67 | |
|
68 | 0 | const uint32_t total_height = hL + hH; |
69 | 0 | assert(total_height > 1); |
70 | | |
71 | | /* Note: loads of input even/odd values must be done in a unaligned */ |
72 | | /* fashion. But stores in tmp can be done with aligned store, since */ |
73 | | /* the temporary buffer is properly aligned */ |
74 | 0 | assert((size_t)buf % (sizeof(int32_t) * Lanes(di)) == 0); |
75 | |
|
76 | 0 | auto s1n_0 = LoadU(di, bandL + 0); |
77 | 0 | auto s1n_1 = LoadU(di, bandL + Lanes(di)); |
78 | 0 | auto d1n_0 = LoadU(di, bandH); |
79 | 0 | auto d1n_1 = LoadU(di, bandH + Lanes(di)); |
80 | | |
81 | | /* s0n = s1n - ((d1n + 1) >> 1); <==> */ |
82 | | /* s0n = s1n - ((d1n + d1n + 2) >> 2); */ |
83 | 0 | auto s0n_0 = s1n_0 - ShiftRight<2>(d1n_0 + d1n_0 + two); |
84 | 0 | auto s0n_1 = s1n_1 - ShiftRight<2>(d1n_1 + d1n_1 + two); |
85 | |
|
86 | 0 | uint32_t i = 0; |
87 | 0 | if(total_height > 3) |
88 | 0 | { |
89 | 0 | uint32_t j; |
90 | 0 | for(i = 0, j = 1; i < (total_height - 3); i += 2, j++) |
91 | 0 | { |
92 | 0 | auto d1c_0 = d1n_0; |
93 | 0 | auto s0c_0 = s0n_0; |
94 | 0 | auto d1c_1 = d1n_1; |
95 | 0 | auto s0c_1 = s0n_1; |
96 | |
|
97 | 0 | s1n_0 = LoadU(di, bandL + j * strideL); |
98 | 0 | s1n_1 = LoadU(di, bandL + j * strideL + Lanes(di)); |
99 | 0 | d1n_0 = LoadU(di, bandH + j * strideH); |
100 | 0 | d1n_1 = LoadU(di, bandH + j * strideH + Lanes(di)); |
101 | | |
102 | | /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/ |
103 | 0 | s0n_0 = s1n_0 - ShiftRight<2>(d1c_0 + d1n_0 + two); |
104 | 0 | s0n_1 = s1n_1 - ShiftRight<2>(d1c_1 + d1n_1 + two); |
105 | |
|
106 | 0 | Store(s0c_0, di, buf + HWY_PLL_COLS_53 * (i + 0)); |
107 | 0 | Store(s0c_1, di, buf + HWY_PLL_COLS_53 * (i + 0) + Lanes(di)); |
108 | | |
109 | | /* d1c + ((s0c + s0n) >> 1) */ |
110 | 0 | Store(d1c_0 + ShiftRight<1>(s0c_0 + s0n_0), di, buf + HWY_PLL_COLS_53 * (i + 1) + 0); |
111 | 0 | Store(d1c_1 + ShiftRight<1>(s0c_1 + s0n_1), di, |
112 | 0 | buf + HWY_PLL_COLS_53 * (i + 1) + Lanes(di)); |
113 | 0 | } |
114 | 0 | } |
115 | 0 | Store(s0n_0, di, buf + HWY_PLL_COLS_53 * (i + 0) + 0); |
116 | 0 | Store(s0n_1, di, buf + HWY_PLL_COLS_53 * (i + 0) + Lanes(di)); |
117 | |
|
118 | 0 | if(total_height & 1) |
119 | 0 | { |
120 | 0 | s1n_0 = LoadU(di, bandL + (size_t)((total_height - 1) / 2) * strideL); |
121 | | /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ |
122 | 0 | auto tmp_len_minus_1 = s1n_0 - ShiftRight<2>(d1n_0 + d1n_0 + two); |
123 | 0 | Store(tmp_len_minus_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1)); |
124 | | /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ |
125 | 0 | Store(d1n_0 + ShiftRight<1>(s0n_0 + tmp_len_minus_1), di, |
126 | 0 | buf + HWY_PLL_COLS_53 * (total_height - 2)); |
127 | |
|
128 | 0 | s1n_1 = LoadU(di, bandL + (size_t)((total_height - 1) / 2) * strideL + Lanes(di)); |
129 | | /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ |
130 | 0 | tmp_len_minus_1 = s1n_1 - ShiftRight<2>(d1n_1 + d1n_1 + two); |
131 | 0 | Store(tmp_len_minus_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di)); |
132 | | /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ |
133 | 0 | Store(d1n_1 + ShiftRight<1>(s0n_1 + tmp_len_minus_1), di, |
134 | 0 | buf + HWY_PLL_COLS_53 * (total_height - 2) + Lanes(di)); |
135 | 0 | } |
136 | 0 | else |
137 | 0 | { |
138 | 0 | Store(d1n_0 + s0n_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0); |
139 | 0 | Store(d1n_1 + s0n_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di)); |
140 | 0 | } |
141 | 0 | hwy_decompress_v_final_memcpy_53(buf, total_height, dest, strideDest); |
142 | 0 | } Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_parity_even_mcols_53(int*, int*, unsigned int, unsigned long, int*, unsigned int, unsigned long, int*, unsigned int) |
143 | | |
144 | | /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or |
145 | | * 16 in AVX2, when top-most pixel is on odd coordinate */ |
146 | | static void hwy_decompress_v_parity_odd_mcols_53(int32_t* buf, int32_t* bandL, const uint32_t hL, |
147 | | const uint32_t strideL, int32_t* bandH, |
148 | | const uint32_t hH, const uint32_t strideH, |
149 | | int32_t* dest, const uint32_t strideDest) |
150 | 0 | { |
151 | 0 | const HWY_FULL(int32_t) di; |
152 | 0 | auto two = Set(di, 2); |
153 | |
|
154 | 0 | const uint32_t total_height = hL + hH; |
155 | 0 | assert(total_height > 2); |
156 | | /* Note: loads of input even/odd values must be done in a unaligned */ |
157 | | /* fashion. But stores in buf can be done with aligned store, since */ |
158 | | /* the temporary buffer is properly aligned */ |
159 | 0 | assert((size_t)buf % (sizeof(int32_t) * Lanes(di)) == 0); |
160 | |
|
161 | 0 | const int32_t* in_even = bandH; |
162 | 0 | const int32_t* in_odd = bandL; |
163 | 0 | auto s1_0 = LoadU(di, in_even + strideH); |
164 | | |
165 | | /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ |
166 | 0 | auto dc_0 = LoadU(di, in_odd + 0) - ShiftRight<2>(LoadU(di, in_even + 0) + s1_0 + two); |
167 | 0 | Store(LoadU(di, in_even + 0) + dc_0, di, buf + HWY_PLL_COLS_53 * 0); |
168 | 0 | auto s1_1 = LoadU(di, in_even + strideH + Lanes(di)); |
169 | | |
170 | | /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ |
171 | 0 | auto dc_1 = LoadU(di, in_odd + Lanes(di)) - |
172 | 0 | ShiftRight<2>(LoadU(di, in_even + Lanes(di)) + s1_1 + two); |
173 | 0 | Store(LoadU(di, in_even + Lanes(di)) + dc_1, di, buf + HWY_PLL_COLS_53 * 0 + Lanes(di)); |
174 | |
|
175 | 0 | uint32_t i; |
176 | 0 | size_t j; |
177 | 0 | for(i = 1, j = 1; i < (total_height - 2 - !(total_height & 1)); i += 2, j++) |
178 | 0 | { |
179 | 0 | auto s2_0 = LoadU(di, in_even + (j + 1) * strideH); |
180 | 0 | auto s2_1 = LoadU(di, in_even + (j + 1) * strideH + Lanes(di)); |
181 | | |
182 | | /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */ |
183 | 0 | auto dn_0 = LoadU(di, in_odd + j * strideL) - ShiftRight<2>(s1_0 + s2_0 + two); |
184 | 0 | auto dn_1 = LoadU(di, in_odd + j * strideL + Lanes(di)) - ShiftRight<2>(s1_1 + s2_1 + two); |
185 | |
|
186 | 0 | Store(dc_0, di, buf + HWY_PLL_COLS_53 * i); |
187 | 0 | Store(dc_1, di, buf + HWY_PLL_COLS_53 * i + Lanes(di)); |
188 | | |
189 | | /* buf[i + 1] = s1 + ((dn + dc) >> 1); */ |
190 | 0 | Store(s1_0 + ShiftRight<1>(dn_0 + dc_0), di, buf + HWY_PLL_COLS_53 * (i + 1) + 0); |
191 | 0 | Store(s1_1 + ShiftRight<1>(dn_1 + dc_1), di, buf + HWY_PLL_COLS_53 * (i + 1) + Lanes(di)); |
192 | |
|
193 | 0 | dc_0 = dn_0; |
194 | 0 | s1_0 = s2_0; |
195 | 0 | dc_1 = dn_1; |
196 | 0 | s1_1 = s2_1; |
197 | 0 | } |
198 | 0 | Store(dc_0, di, buf + HWY_PLL_COLS_53 * i); |
199 | 0 | Store(dc_1, di, buf + HWY_PLL_COLS_53 * i + Lanes(di)); |
200 | |
|
201 | 0 | if(!(total_height & 1)) |
202 | 0 | { |
203 | | /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */ |
204 | 0 | auto dn_0 = LoadU(di, in_odd + (size_t)(total_height / 2 - 1) * strideL) - |
205 | 0 | ShiftRight<2>(s1_0 + s1_0 + two); |
206 | 0 | auto dn_1 = LoadU(di, in_odd + (size_t)(total_height / 2 - 1) * strideL + Lanes(di)) - |
207 | 0 | ShiftRight<2>(s1_1 + s1_1 + two); |
208 | | |
209 | | /* buf[len - 2] = s1 + ((dn + dc) >> 1); */ |
210 | 0 | Store(s1_0 + ShiftRight<1>(dn_0 + dc_0), di, |
211 | 0 | buf + HWY_PLL_COLS_53 * (total_height - 2) + 0); |
212 | 0 | Store(s1_1 + ShiftRight<1>(dn_1 + dc_1), di, |
213 | 0 | buf + HWY_PLL_COLS_53 * (total_height - 2) + Lanes(di)); |
214 | |
|
215 | 0 | Store(dn_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0); |
216 | 0 | Store(dn_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di)); |
217 | 0 | } |
218 | 0 | else |
219 | 0 | { |
220 | 0 | Store(s1_0 + dc_0, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + 0); |
221 | 0 | Store(s1_1 + dc_1, di, buf + HWY_PLL_COLS_53 * (total_height - 1) + Lanes(di)); |
222 | 0 | } |
223 | 0 | hwy_decompress_v_final_memcpy_53(buf, total_height, dest, strideDest); |
224 | 0 | } Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_SPR::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3_ZEN4::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX3::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_AVX2::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE4::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSSE3::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) Unexecuted instantiation: WaveletReverse.cpp:grk::N_SSE2::hwy_decompress_v_parity_odd_mcols_53(int*, int*, unsigned int, unsigned int, int*, unsigned int, unsigned int, int*, unsigned int) |
225 | | |
226 | | } // namespace HWY_NAMESPACE |
227 | | } // namespace grk |
228 | | HWY_AFTER_NAMESPACE(); |
229 | | |
230 | | #if HWY_ONCE |
231 | | namespace grk |
232 | | { |
/* Register the per-target Highway implementations defined earlier in this file with HWY_EXPORT. */
HWY_EXPORT(hwy_num_lanes);
HWY_EXPORT(hwy_decompress_v_parity_even_mcols_53);
HWY_EXPORT(hwy_decompress_v_parity_odd_mcols_53);
236 | | /* <summary> */ |
237 | | /* Determine maximum computed resolution level for inverse wavelet transform */ |
238 | | /* </summary> */ |
239 | | uint32_t max_resolution(Resolution* GRK_RESTRICT r, uint32_t i) |
240 | 0 | { |
241 | 0 | uint32_t mr = 0; |
242 | 0 | while(--i) |
243 | 0 | { |
244 | 0 | ++r; |
245 | 0 | uint32_t w; |
246 | 0 | if(mr < (w = r->x1 - r->x0)) |
247 | 0 | mr = w; |
248 | 0 | if(mr < (w = r->y1 - r->y0)) |
249 | 0 | mr = w; |
250 | 0 | } |
251 | 0 | return mr; |
252 | 0 | } |
253 | | |
254 | | /********************************************************************************** |
255 | | * |
256 | | * Full 9/7 Inverse Wavelet |
257 | | * |
258 | | * |
259 | | * |
260 | | **********************************************************************************/ |
261 | | |
/* Coefficients used by the inverse 9/7 transform in this file: four lifting */
/* factors plus the two scaling factors applied in decompress_step_97. */
/* The trailing numbers are the same values multiplied by 8192 and rounded. */
static constexpr float dwt_alpha = 1.586134342f; /* 12994 */
static constexpr float dwt_beta = 0.052980118f; /* 434 */
static constexpr float dwt_gamma = -0.882911075f; /* -7233 */
static constexpr float dwt_delta = -0.443506852f; /* -3633 */
static constexpr float K = 1.230174105f; /* 10078 */
static constexpr float twice_invK = 1.625732422f;
268 | | |
269 | | // #undef __SSE__ |
270 | | |
271 | | #ifdef __SSE__ |
272 | | void WaveletReverse::decompress_step1_sse_97(Params97 d, const __m128 c) |
273 | 1.61M | { |
274 | | // process 4 floats at a time |
275 | 1.61M | auto mmData = (__m128*)d.data; |
276 | 1.61M | uint32_t i; |
277 | 17.3M | for(i = 0; i + 3 < d.len; i += 4, mmData += 8) |
278 | 15.7M | { |
279 | 15.7M | mmData[0] = _mm_mul_ps(mmData[0], c); |
280 | 15.7M | mmData[2] = _mm_mul_ps(mmData[2], c); |
281 | 15.7M | mmData[4] = _mm_mul_ps(mmData[4], c); |
282 | 15.7M | mmData[6] = _mm_mul_ps(mmData[6], c); |
283 | 15.7M | } |
284 | 4.02M | for(; i < d.len; ++i, mmData += 2) |
285 | 2.41M | mmData[0] = _mm_mul_ps(mmData[0], c); |
286 | 1.61M | } |
287 | | #endif |
288 | | |
289 | | void WaveletReverse::decompress_step1_97(const Params97& d, const float c) |
290 | 1.52M | { |
291 | 1.52M | #ifdef __SSE__ |
292 | 1.52M | decompress_step1_sse_97(d, _mm_set1_ps(c)); |
293 | | #else |
294 | | float* GRK_RESTRICT fw = (float*)d.data; |
295 | | |
296 | | for(uint32_t i = 0; i < d.len; ++i, fw += 8) |
297 | | { |
298 | | fw[0] *= c; |
299 | | fw[1] *= c; |
300 | | fw[2] *= c; |
301 | | fw[3] *= c; |
302 | | ; |
303 | | } |
304 | | #endif |
305 | 1.52M | } |
306 | | |
307 | | #ifdef __SSE__ |
308 | | static void decompress_step2_sse_97(const Params97& d, __m128 c) |
309 | 3.02M | { |
310 | 3.02M | __m128* GRK_RESTRICT vec_data = (__m128*)d.data; |
311 | | |
312 | 3.02M | uint32_t imax = (std::min<uint32_t>)(d.len, d.lenMax); |
313 | | |
314 | | // initial tmp1 value is only necessary when |
315 | | // absolute start of line is at 0 |
316 | 3.02M | auto tmp1 = ((__m128*)d.dataPrev)[0]; |
317 | 3.02M | uint32_t i = 0; |
318 | 34.0M | for(; i + 3 < imax; i += 4) |
319 | 31.0M | { |
320 | 31.0M | auto tmp2 = vec_data[-1]; |
321 | 31.0M | auto tmp3 = vec_data[0]; |
322 | 31.0M | auto tmp4 = vec_data[1]; |
323 | 31.0M | auto tmp5 = vec_data[2]; |
324 | 31.0M | auto tmp6 = vec_data[3]; |
325 | 31.0M | auto tmp7 = vec_data[4]; |
326 | 31.0M | auto tmp8 = vec_data[5]; |
327 | 31.0M | auto tmp9 = vec_data[6]; |
328 | 31.0M | vec_data[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); |
329 | 31.0M | vec_data[1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); |
330 | 31.0M | vec_data[3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); |
331 | 31.0M | vec_data[5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); |
332 | 31.0M | tmp1 = tmp9; |
333 | 31.0M | vec_data += 8; |
334 | 31.0M | } |
335 | | |
336 | 7.11M | for(; i < imax; ++i) |
337 | 4.09M | { |
338 | 4.09M | auto tmp2 = vec_data[-1]; |
339 | 4.09M | auto tmp3 = vec_data[0]; |
340 | 4.09M | vec_data[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); |
341 | 4.09M | tmp1 = tmp3; |
342 | 4.09M | vec_data += 2; |
343 | 4.09M | } |
344 | 3.02M | if(d.lenMax < d.len) |
345 | 1.48M | { |
346 | 1.48M | assert(d.lenMax + 1 == d.len); |
347 | 1.48M | c = _mm_add_ps(c, c); |
348 | 1.48M | c = _mm_mul_ps(c, vec_data[-2]); |
349 | 1.48M | vec_data[-1] = _mm_add_ps(vec_data[-1], c); |
350 | 1.48M | } |
351 | 3.02M | } |
352 | | #endif |
353 | | |
354 | | static void decompress_step2_97(const Params97& d, float c) |
355 | 3.00M | { |
356 | 3.00M | #ifdef __SSE__ |
357 | 3.00M | decompress_step2_sse_97(d, _mm_set1_ps(c)); |
358 | | #else |
359 | | |
360 | | float* dataPrev = (float*)d.dataPrev; |
361 | | float* data = (float*)d.data; |
362 | | |
363 | | uint32_t imax = (std::min<uint32_t>)(d.len, d.lenMax); |
364 | | for(uint32_t i = 0; i < imax; ++i) |
365 | | { |
366 | | float tmp1_1 = dataPrev[0]; |
367 | | float tmp1_2 = dataPrev[1]; |
368 | | float tmp1_3 = dataPrev[2]; |
369 | | float tmp1_4 = dataPrev[3]; |
370 | | float tmp2_1 = data[-4]; |
371 | | float tmp2_2 = data[-3]; |
372 | | float tmp2_3 = data[-2]; |
373 | | float tmp2_4 = data[-1]; |
374 | | float tmp3_1 = data[0]; |
375 | | float tmp3_2 = data[1]; |
376 | | float tmp3_3 = data[2]; |
377 | | float tmp3_4 = data[3]; |
378 | | data[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); |
379 | | data[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); |
380 | | data[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); |
381 | | data[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); |
382 | | dataPrev = data; |
383 | | data += 8; |
384 | | } |
385 | | if(d.lenMax < d.len) |
386 | | { |
387 | | assert(d.lenMax + 1 == d.len); |
388 | | c += c; |
389 | | data[-4] = data[-4] + dataPrev[0] * c; |
390 | | data[-3] = data[-3] + dataPrev[1] * c; |
391 | | data[-2] = data[-2] + dataPrev[2] * c; |
392 | | data[-1] = data[-1] + dataPrev[3] * c; |
393 | | } |
394 | | #endif |
395 | 3.00M | } |
396 | | /* <summary> */ |
397 | | /* Inverse 9-7 wavelet transform in 1-D. */ |
398 | | /* </summary> */ |
399 | | void WaveletReverse::decompress_step_97(dwt_data<vec4f>* GRK_RESTRICT dwt) |
400 | 878k | { |
401 | 878k | if((!dwt->parity && dwt->dn_full == 0 && dwt->sn_full <= 1) || |
402 | 878k | (dwt->parity && dwt->sn_full == 0 && dwt->dn_full >= 1)) |
403 | 22.4k | return; |
404 | | |
405 | 856k | decompress_step1_97(makeParams97(dwt, true, true), K); |
406 | 856k | decompress_step1_97(makeParams97(dwt, false, true), twice_invK); |
407 | 856k | decompress_step2_97(makeParams97(dwt, true, false), dwt_delta); |
408 | 856k | decompress_step2_97(makeParams97(dwt, false, false), dwt_gamma); |
409 | 856k | decompress_step2_97(makeParams97(dwt, true, false), dwt_beta); |
410 | 856k | decompress_step2_97(makeParams97(dwt, false, false), dwt_alpha); |
411 | 856k | } |
412 | | void WaveletReverse::interleave_h_97(dwt_data<vec4f>* GRK_RESTRICT dwt, |
413 | | grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH, |
414 | | uint32_t remaining_height) |
415 | 0 | { |
416 | 0 | float* GRK_RESTRICT bi = (float*)(dwt->mem + dwt->parity); |
417 | 0 | uint32_t x0 = dwt->win_l.x0; |
418 | 0 | uint32_t x1 = dwt->win_l.x1; |
419 | 0 | const size_t vec4f_elts = vec4f::NUM_ELTS; |
420 | 0 | for(uint32_t k = 0; k < 2; ++k) |
421 | 0 | { |
422 | 0 | auto band = (k == 0) ? winL.buf_ : winH.buf_; |
423 | 0 | uint32_t stride = (k == 0) ? winL.stride_ : winH.stride_; |
424 | 0 | if(remaining_height >= vec4f_elts && ((size_t)band & 0x0f) == 0 && ((size_t)bi & 0x0f) == 0 && |
425 | 0 | (stride & 0x0f) == 0) |
426 | 0 | { |
427 | | /* Fast code path */ |
428 | 0 | for(uint32_t i = x0; i < x1; ++i, bi += vec4f_elts * 2) |
429 | 0 | { |
430 | 0 | uint32_t j = i; |
431 | 0 | bi[0] = band[j]; |
432 | 0 | j += stride; |
433 | 0 | bi[1] = band[j]; |
434 | 0 | j += stride; |
435 | 0 | bi[2] = band[j]; |
436 | 0 | j += stride; |
437 | 0 | bi[3] = band[j]; |
438 | 0 | } |
439 | 0 | } |
440 | 0 | else |
441 | 0 | { |
442 | | /* Slow code path */ |
443 | 0 | for(uint32_t i = x0; i < x1; ++i, bi += vec4f_elts * 2) |
444 | 0 | { |
445 | 0 | uint32_t j = i; |
446 | 0 | bi[0] = band[j]; |
447 | 0 | j += stride; |
448 | 0 | if(remaining_height == 1) |
449 | 0 | continue; |
450 | 0 | bi[1] = band[j]; |
451 | 0 | j += stride; |
452 | 0 | if(remaining_height == 2) |
453 | 0 | continue; |
454 | 0 | bi[2] = band[j]; |
455 | 0 | j += stride; |
456 | 0 | if(remaining_height == 3) |
457 | 0 | continue; |
458 | 0 | bi[3] = band[j]; |
459 | 0 | } |
460 | 0 | } |
461 | 0 | bi = (float*)(dwt->mem + 1 - dwt->parity); |
462 | 0 | x0 = dwt->win_h.x0; |
463 | 0 | x1 = dwt->win_h.x1; |
464 | 0 | } |
465 | 0 | } |
466 | | void WaveletReverse::decompress_h_strip_97(dwt_data<vec4f>* GRK_RESTRICT horiz, |
467 | | const uint32_t resHeight, grk_buf2d_simple<float> winL, |
468 | | grk_buf2d_simple<float> winH, |
469 | | grk_buf2d_simple<float> winDest) |
470 | 0 | { |
471 | 0 | float* GRK_RESTRICT dest = winDest.buf_; |
472 | 0 | const uint32_t strideDest = winDest.stride_; |
473 | 0 | uint32_t j; |
474 | 0 | const size_t vec4f_elts = vec4f::NUM_ELTS; |
475 | 0 | for(j = 0; j < (resHeight & (uint32_t)(~(vec4f_elts - 1))); j += vec4f_elts) |
476 | 0 | { |
477 | 0 | interleave_h_97(horiz, winL, winH, resHeight - j); |
478 | 0 | decompress_step_97(horiz); |
479 | 0 | for(uint32_t k = 0; k < horiz->sn_full + horiz->dn_full; k++) |
480 | 0 | { |
481 | 0 | dest[k] = horiz->mem[k].val[0]; |
482 | 0 | dest[k + (size_t)strideDest] = horiz->mem[k].val[1]; |
483 | 0 | dest[k + (size_t)strideDest * 2] = horiz->mem[k].val[2]; |
484 | 0 | dest[k + (size_t)strideDest * 3] = horiz->mem[k].val[3]; |
485 | 0 | } |
486 | 0 | winL.buf_ += winL.stride_ << 2; |
487 | 0 | winH.buf_ += winH.stride_ << 2; |
488 | 0 | dest += strideDest << 2; |
489 | 0 | } |
490 | 0 | if(j < resHeight) |
491 | 0 | { |
492 | 0 | interleave_h_97(horiz, winL, winH, resHeight - j); |
493 | 0 | decompress_step_97(horiz); |
494 | 0 | for(uint32_t k = 0; k < horiz->sn_full + horiz->dn_full; k++) |
495 | 0 | { |
496 | 0 | switch(resHeight - j) |
497 | 0 | { |
498 | 0 | case 3: |
499 | 0 | dest[k + (strideDest << 1)] = horiz->mem[k].val[2]; |
500 | | /* FALLTHRU */ |
501 | 0 | case 2: |
502 | 0 | dest[k + strideDest] = horiz->mem[k].val[1]; |
503 | | /* FALLTHRU */ |
504 | 0 | case 1: |
505 | 0 | dest[k] = horiz->mem[k].val[0]; |
506 | 0 | } |
507 | 0 | } |
508 | 0 | } |
509 | 0 | } |
510 | | bool WaveletReverse::decompress_h_97(uint8_t res, uint32_t numThreads, size_t dataLength, |
511 | | dwt_data<vec4f>& GRK_RESTRICT horiz, const uint32_t resHeight, |
512 | | grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH, |
513 | | grk_buf2d_simple<float> winDest) |
514 | 0 | { |
515 | 0 | if(resHeight == 0) |
516 | 0 | return true; |
517 | 0 | if(numThreads == 1) |
518 | 0 | { |
519 | 0 | decompress_h_strip_97(&horiz, resHeight, winL, winH, winDest); |
520 | 0 | } |
521 | 0 | else |
522 | 0 | { |
523 | 0 | uint32_t numTasks = numThreads; |
524 | 0 | if(resHeight < numTasks) |
525 | 0 | numTasks = resHeight; |
526 | 0 | uint32_t incrPerJob = resHeight / numTasks; |
527 | 0 | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); |
528 | 0 | if(!imageComponentFlow) |
529 | 0 | { |
530 | 0 | Logger::logger_.warn("Missing image component flow"); |
531 | 0 | return true; |
532 | 0 | } |
533 | 0 | auto resFlow = imageComponentFlow->getResFlow(res - 1); |
534 | 0 | for(uint32_t j = 0; j < numTasks; ++j) |
535 | 0 | { |
536 | 0 | auto indexMin = j * incrPerJob; |
537 | 0 | auto indexMax = (j < (numTasks - 1U) ? (j + 1U) * incrPerJob : resHeight) - indexMin; |
538 | 0 | auto myhoriz = new dwt_data<vec4f>(horiz); |
539 | 0 | if(!myhoriz->alloc(dataLength)) |
540 | 0 | { |
541 | 0 | Logger::logger_.error("Out of memory"); |
542 | 0 | return false; |
543 | 0 | } |
544 | 0 | resFlow->waveletHoriz_->nextTask().work([this, myhoriz, indexMax, winL, winH, winDest] { |
545 | 0 | decompress_h_strip_97(myhoriz, indexMax, winL, winH, winDest); |
546 | 0 | delete myhoriz; |
547 | 0 | }); |
548 | 0 | winL.incY_IN_PLACE(incrPerJob); |
549 | 0 | winH.incY_IN_PLACE(incrPerJob); |
550 | 0 | winDest.incY_IN_PLACE(incrPerJob); |
551 | 0 | } |
552 | 0 | } |
553 | 0 | return true; |
554 | 0 | } |
555 | | void WaveletReverse::interleave_v_97(dwt_data<vec4f>* GRK_RESTRICT dwt, |
556 | | grk_buf2d_simple<float> winL, grk_buf2d_simple<float> winH, |
557 | | uint32_t nb_elts_read) |
558 | 0 | { |
559 | 0 | auto bi = dwt->mem + dwt->parity; |
560 | 0 | auto band = winL.buf_ + dwt->win_l.x0 * winL.stride_; |
561 | 0 | for(uint32_t i = dwt->win_l.x0; i < dwt->win_l.x1; ++i, bi += 2) |
562 | 0 | { |
563 | 0 | memcpy((float*)bi, band, nb_elts_read * sizeof(float)); |
564 | 0 | band += winL.stride_; |
565 | 0 | } |
566 | 0 | bi = dwt->mem + 1 - dwt->parity; |
567 | 0 | band = winH.buf_ + dwt->win_h.x0 * winH.stride_; |
568 | 0 | for(uint32_t i = dwt->win_h.x0; i < dwt->win_h.x1; ++i, bi += 2) |
569 | 0 | { |
570 | 0 | memcpy((float*)bi, band, nb_elts_read * sizeof(float)); |
571 | 0 | band += winH.stride_; |
572 | 0 | } |
573 | 0 | } |
574 | | void WaveletReverse::decompress_v_strip_97(dwt_data<vec4f>* GRK_RESTRICT vert, |
575 | | const uint32_t resWidth, const uint32_t resHeight, |
576 | | grk_buf2d_simple<float> winL, |
577 | | grk_buf2d_simple<float> winH, |
578 | | grk_buf2d_simple<float> winDest) |
579 | 0 | { |
580 | 0 | uint32_t j; |
581 | 0 | const size_t vec4f_elts = vec4f::NUM_ELTS; |
582 | 0 | for(j = 0; j < (resWidth & (uint32_t) ~(vec4f_elts - 1)); j += vec4f_elts) |
583 | 0 | { |
584 | 0 | interleave_v_97(vert, winL, winH, vec4f_elts); |
585 | 0 | decompress_step_97(vert); |
586 | 0 | auto destPtr = winDest.buf_; |
587 | 0 | for(uint32_t k = 0; k < resHeight; ++k) |
588 | 0 | { |
589 | 0 | memcpy(destPtr, vert->mem + k, sizeof(vec4f)); |
590 | 0 | destPtr += winDest.stride_; |
591 | 0 | } |
592 | 0 | winL.buf_ += vec4f_elts; |
593 | 0 | winH.buf_ += vec4f_elts; |
594 | 0 | winDest.buf_ += vec4f_elts; |
595 | 0 | } |
596 | 0 | if(j < resWidth) |
597 | 0 | { |
598 | 0 | j = resWidth & (vec4f_elts - 1); |
599 | 0 | interleave_v_97(vert, winL, winH, j); |
600 | 0 | decompress_step_97(vert); |
601 | 0 | auto destPtr = winDest.buf_; |
602 | 0 | for(uint32_t k = 0; k < resHeight; ++k) |
603 | 0 | { |
604 | 0 | memcpy(destPtr, vert->mem + k, j * sizeof(float)); |
605 | 0 | destPtr += winDest.stride_; |
606 | 0 | } |
607 | 0 | } |
608 | 0 | } |
609 | | bool WaveletReverse::decompress_v_97(uint8_t res, uint32_t numThreads, size_t dataLength, |
610 | | dwt_data<vec4f>& GRK_RESTRICT vert, const uint32_t resWidth, |
611 | | const uint32_t resHeight, grk_buf2d_simple<float> winL, |
612 | | grk_buf2d_simple<float> winH, grk_buf2d_simple<float> winDest) |
613 | 0 | { |
614 | 0 | if(resWidth == 0) |
615 | 0 | return true; |
616 | 0 | if(numThreads == 1) |
617 | 0 | { |
618 | 0 | decompress_v_strip_97(&vert, resWidth, resHeight, winL, winH, winDest); |
619 | 0 | } |
620 | 0 | else |
621 | 0 | { |
622 | 0 | auto numTasks = numThreads; |
623 | 0 | if(resWidth < numTasks) |
624 | 0 | numTasks = resWidth; |
625 | 0 | auto incrPerJob = resWidth / numTasks; |
626 | 0 | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); |
627 | 0 | if(!imageComponentFlow) |
628 | 0 | { |
629 | 0 | Logger::logger_.warn("Missing image component flow"); |
630 | 0 | return false; |
631 | 0 | } |
632 | 0 | auto resFlow = imageComponentFlow->getResFlow(res - 1); |
633 | 0 | for(uint32_t j = 0; j < numTasks; j++) |
634 | 0 | { |
635 | 0 | auto indexMin = j * incrPerJob; |
636 | 0 | auto indexMax = (j < (numTasks - 1U) ? (j + 1U) * incrPerJob : resWidth) - indexMin; |
637 | 0 | auto myvert = new dwt_data<vec4f>(vert); |
638 | 0 | if(!myvert->alloc(dataLength)) |
639 | 0 | { |
640 | 0 | Logger::logger_.error("Out of memory"); |
641 | 0 | delete myvert; |
642 | 0 | return false; |
643 | 0 | } |
644 | 0 | resFlow->waveletVert_->nextTask().work( |
645 | 0 | [this, myvert, resHeight, indexMax, winL, winH, winDest] { |
646 | 0 | decompress_v_strip_97(myvert, indexMax, resHeight, winL, winH, winDest); |
647 | 0 | delete myvert; |
648 | 0 | }); |
649 | 0 | winL.incX_IN_PLACE(incrPerJob); |
650 | 0 | winH.incX_IN_PLACE(incrPerJob); |
651 | 0 | winDest.incX_IN_PLACE(incrPerJob); |
652 | 0 | } |
653 | 0 | } |
654 | | |
655 | 0 | return true; |
656 | 0 | } |
657 | | /* <summary> */ |
658 | | /* Inverse 9-7 wavelet transform in 2-D. */ |
659 | | /* </summary> */ |
660 | | bool WaveletReverse::decompress_tile_97(void) |
661 | 0 | { |
662 | 0 | if(numres_ == 1U) |
663 | 0 | return true; |
664 | | |
665 | 0 | auto tr = tilec_->resolutions_; |
666 | 0 | auto buf = tilec_->getWindow(); |
667 | 0 | uint32_t resWidth = tr->width(); |
668 | 0 | uint32_t resHeight = tr->height(); |
669 | |
|
670 | 0 | size_t dataLength = max_resolution(tr, numres_); |
671 | 0 | if(!horizF_.alloc(dataLength)) |
672 | 0 | { |
673 | 0 | Logger::logger_.error("decompress_tile_97: out of memory"); |
674 | 0 | return false; |
675 | 0 | } |
676 | 0 | vertF_.mem = horizF_.mem; |
677 | 0 | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); |
678 | 0 | for(uint8_t res = 1; res < numres_; ++res) |
679 | 0 | { |
680 | 0 | horizF_.sn_full = resWidth; |
681 | 0 | vertF_.sn_full = resHeight; |
682 | 0 | ++tr; |
683 | 0 | resWidth = tr->width(); |
684 | 0 | resHeight = tr->height(); |
685 | 0 | if(resWidth == 0 || resHeight == 0) |
686 | 0 | continue; |
687 | 0 | horizF_.dn_full = resWidth - horizF_.sn_full; |
688 | 0 | horizF_.parity = tr->x0 & 1; |
689 | 0 | horizF_.win_l = grk_line32(0, horizF_.sn_full); |
690 | 0 | horizF_.win_h = grk_line32(0, horizF_.dn_full); |
691 | 0 | auto winSplitL = buf->getResWindowBufferSplitSimpleF(res, SPLIT_L); |
692 | 0 | auto winSplitH = buf->getResWindowBufferSplitSimpleF(res, SPLIT_H); |
693 | 0 | if(!decompress_h_97(res, numThreads, dataLength, horizF_, vertF_.sn_full, |
694 | 0 | buf->getResWindowBufferSimpleF(res - 1U), |
695 | 0 | buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_HL), winSplitL)) |
696 | 0 | return false; |
697 | 0 | if(!decompress_h_97(res, numThreads, dataLength, horizF_, resHeight - vertF_.sn_full, |
698 | 0 | buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_LH), |
699 | 0 | buf->getBandWindowBufferPaddedSimpleF(res, BAND_ORIENT_HH), winSplitH)) |
700 | 0 | return false; |
701 | 0 | vertF_.dn_full = resHeight - vertF_.sn_full; |
702 | 0 | vertF_.parity = tr->y0 & 1; |
703 | 0 | vertF_.win_l = grk_line32(0, vertF_.sn_full); |
704 | 0 | vertF_.win_h = grk_line32(0, vertF_.dn_full); |
705 | 0 | if(!decompress_v_97(res, numThreads, dataLength, vertF_, resWidth, resHeight, winSplitL, |
706 | 0 | winSplitH, buf->getResWindowBufferSimpleF(res))) |
707 | 0 | return false; |
708 | 0 | } |
709 | | |
710 | 0 | return true; |
711 | 0 | } |
712 | | |
713 | | /************************************************************************************** |
714 | | * |
715 | | * Full 5/3 Inverse Wavelet |
716 | | * |
717 | | * |
718 | | *************************************************************************************/ |
719 | | |
720 | | void WaveletReverse::decompress_h_parity_even_53(int32_t* buf, int32_t* bandL, /* even */ |
721 | | const uint32_t wL, int32_t* bandH, |
722 | | const uint32_t wH, int32_t* dest) |
723 | 0 | { /* odd */ |
724 | 0 | const uint32_t total_width = wL + wH; |
725 | 0 | assert(total_width > 1); |
726 | | |
727 | | /* Improved version of the TWO_PASS_VERSION: */ |
728 | | /* Performs lifting in one single iteration. Saves memory */ |
729 | | /* accesses and explicit interleaving. */ |
730 | 0 | int32_t s1n = bandL[0]; |
731 | 0 | int32_t d1n = bandH[0]; |
732 | 0 | int32_t s0n = s1n - ((d1n + 1) >> 1); |
733 | 0 | uint32_t i = 0; |
734 | 0 | if(total_width > 2) |
735 | 0 | { |
736 | 0 | for(uint32_t j = 1; i < (total_width - 3); i += 2, j++) |
737 | 0 | { |
738 | 0 | int32_t d1c = d1n; |
739 | 0 | int32_t s0c = s0n; |
740 | |
|
741 | 0 | s1n = bandL[j]; |
742 | 0 | d1n = bandH[j]; |
743 | 0 | s0n = s1n - ((d1c + d1n + 2) >> 2); |
744 | 0 | buf[i] = s0c; |
745 | 0 | buf[i + 1] = d1c + ((s0c + s0n) >> 1); |
746 | 0 | } |
747 | 0 | } |
748 | 0 | buf[i] = s0n; |
749 | 0 | if(total_width & 1) |
750 | 0 | { |
751 | 0 | buf[total_width - 1] = bandL[(total_width - 1) >> 1] - ((d1n + 1) >> 1); |
752 | 0 | buf[total_width - 2] = d1n + ((s0n + buf[total_width - 1]) >> 1); |
753 | 0 | } |
754 | 0 | else |
755 | 0 | { |
756 | 0 | buf[total_width - 1] = d1n + s0n; |
757 | 0 | } |
758 | 0 | memcpy(dest, buf, total_width * sizeof(int32_t)); |
759 | 0 | } |
760 | | |
761 | | void WaveletReverse::decompress_h_parity_odd_53(int32_t* buf, int32_t* bandL, /* odd */ |
762 | | const uint32_t wL, int32_t* bandH, |
763 | | const uint32_t wH, int32_t* dest) |
764 | 0 | { /* even */ |
765 | 0 | const uint32_t total_width = wL + wH; |
766 | 0 | assert(total_width > 2); |
767 | | |
768 | | /* Improved version of the TWO_PASS_VERSION: |
769 | | Performs lifting in one single iteration. Saves memory |
770 | | accesses and explicit interleaving. */ |
771 | 0 | int32_t s1 = bandH[1]; |
772 | 0 | int32_t dc = bandL[0] - ((bandH[0] + s1 + 2) >> 2); |
773 | 0 | buf[0] = bandH[0] + dc; |
774 | 0 | uint32_t i, j; |
775 | 0 | for(i = 1, j = 1; i < (total_width - 2 - !(total_width & 1)); i += 2, j++) |
776 | 0 | { |
777 | 0 | int32_t s2 = bandH[j + 1]; |
778 | 0 | int32_t dn = bandL[j] - ((s1 + s2 + 2) >> 2); |
779 | |
|
780 | 0 | buf[i] = dc; |
781 | 0 | buf[i + 1] = s1 + ((dn + dc) >> 1); |
782 | 0 | dc = dn; |
783 | 0 | s1 = s2; |
784 | 0 | } |
785 | 0 | buf[i] = dc; |
786 | 0 | if(!(total_width & 1)) |
787 | 0 | { |
788 | 0 | int32_t dn = bandL[(total_width >> 1) - 1] - ((s1 + 1) >> 1); |
789 | 0 | buf[total_width - 2] = s1 + ((dn + dc) >> 1); |
790 | 0 | buf[total_width - 1] = dn; |
791 | 0 | } |
792 | 0 | else |
793 | 0 | { |
794 | 0 | buf[total_width - 1] = s1 + dc; |
795 | 0 | } |
796 | 0 | memcpy(dest, buf, total_width * sizeof(int32_t)); |
797 | 0 | } |
798 | | |
799 | | /** Vertical inverse 5x3 wavelet transform for one column, when top-most |
800 | | * pixel is on even coordinate */ |
801 | | void WaveletReverse::decompress_v_parity_even_53(int32_t* buf, int32_t* bandL, const uint32_t hL, |
802 | | const uint32_t strideL, int32_t* bandH, |
803 | | const uint32_t hH, const uint32_t strideH, |
804 | | int32_t* dest, const uint32_t strideDest) |
805 | 0 | { |
806 | 0 | const uint32_t total_height = hL + hH; |
807 | 0 | assert(total_height > 1); |
808 | | |
809 | | /* Performs lifting in one single iteration. Saves memory */ |
810 | | /* accesses and explicit interleaving. */ |
811 | 0 | int32_t s1n = bandL[0]; |
812 | 0 | int32_t d1n = bandH[0]; |
813 | 0 | int32_t s0n = s1n - ((d1n + 1) >> 1); |
814 | |
|
815 | 0 | uint32_t i = 0; |
816 | 0 | if(total_height > 2) |
817 | 0 | { |
818 | 0 | auto bL = bandL + strideL; |
819 | 0 | auto bH = bandH + strideH; |
820 | 0 | for(uint32_t j = 0; i < (total_height - 3); i += 2, j++) |
821 | 0 | { |
822 | 0 | int32_t d1c = d1n; |
823 | 0 | int32_t s0c = s0n; |
824 | 0 | s1n = *bL; |
825 | 0 | bL += strideL; |
826 | 0 | d1n = *bH; |
827 | 0 | bH += strideH; |
828 | 0 | s0n = s1n - ((d1c + d1n + 2) >> 2); |
829 | 0 | buf[i] = s0c; |
830 | 0 | buf[i + 1] = d1c + ((s0c + s0n) >> 1); |
831 | 0 | } |
832 | 0 | } |
833 | 0 | buf[i] = s0n; |
834 | 0 | if(total_height & 1) |
835 | 0 | { |
836 | 0 | buf[total_height - 1] = bandL[((total_height - 1) >> 1) * strideL] - ((d1n + 1) >> 1); |
837 | 0 | buf[total_height - 2] = d1n + ((s0n + buf[total_height - 1]) >> 1); |
838 | 0 | } |
839 | 0 | else |
840 | 0 | { |
841 | 0 | buf[total_height - 1] = d1n + s0n; |
842 | 0 | } |
843 | 0 | for(i = 0; i < total_height; ++i) |
844 | 0 | { |
845 | 0 | *dest = buf[i]; |
846 | 0 | dest += strideDest; |
847 | 0 | } |
848 | 0 | } |
849 | | /** Vertical inverse 5x3 wavelet transform for one column, when top-most |
850 | | * pixel is on odd coordinate */ |
851 | | void WaveletReverse::decompress_v_parity_odd_53(int32_t* buf, int32_t* bandL, const uint32_t hL, |
852 | | const uint32_t strideL, int32_t* bandH, |
853 | | const uint32_t hH, const uint32_t strideH, |
854 | | int32_t* dest, const uint32_t strideDest) |
855 | 0 | { |
856 | 0 | const uint32_t total_height = hL + hH; |
857 | 0 | assert(total_height > 2); |
858 | | |
859 | | /* Performs lifting in one single iteration. Saves memory */ |
860 | | /* accesses and explicit interleaving. */ |
861 | 0 | int32_t s1 = bandH[strideH]; |
862 | 0 | int32_t dc = bandL[0] - ((bandH[0] + s1 + 2) >> 2); |
863 | 0 | buf[0] = bandH[0] + dc; |
864 | 0 | auto s2_ptr = bandH + (strideH << 1); |
865 | 0 | auto dn_ptr = bandL + strideL; |
866 | 0 | uint32_t i, j; |
867 | 0 | for(i = 1, j = 1; i < (total_height - 2 - !(total_height & 1)); i += 2, j++) |
868 | 0 | { |
869 | 0 | int32_t s2 = *s2_ptr; |
870 | 0 | s2_ptr += strideH; |
871 | |
|
872 | 0 | int32_t dn = *dn_ptr - ((s1 + s2 + 2) >> 2); |
873 | 0 | dn_ptr += strideL; |
874 | |
|
875 | 0 | buf[i] = dc; |
876 | 0 | buf[i + 1] = s1 + ((dn + dc) >> 1); |
877 | 0 | dc = dn; |
878 | 0 | s1 = s2; |
879 | 0 | } |
880 | 0 | buf[i] = dc; |
881 | 0 | if(!(total_height & 1)) |
882 | 0 | { |
883 | 0 | int32_t dn = bandL[((total_height >> 1) - 1) * strideL] - ((s1 + 1) >> 1); |
884 | 0 | buf[total_height - 2] = s1 + ((dn + dc) >> 1); |
885 | 0 | buf[total_height - 1] = dn; |
886 | 0 | } |
887 | 0 | else |
888 | 0 | { |
889 | 0 | buf[total_height - 1] = s1 + dc; |
890 | 0 | } |
891 | 0 | for(i = 0; i < total_height; ++i) |
892 | 0 | { |
893 | 0 | *dest = buf[i]; |
894 | 0 | dest += strideDest; |
895 | 0 | } |
896 | 0 | } |
897 | | /* <summary> */ |
898 | | /* Inverse 5-3 wavelet transform in 1-D for one row. */ |
899 | | /* </summary> */ |
900 | | /* Performs interleave, inverse wavelet transform and copy back to buffer */ |
901 | | void WaveletReverse::decompress_h_53(const dwt_data<int32_t>* dwt, int32_t* bandL, int32_t* bandH, |
902 | | int32_t* dest) |
903 | 0 | { |
904 | 0 | const uint32_t total_width = dwt->sn_full + dwt->dn_full; |
905 | 0 | assert(total_width != 0); |
906 | 0 | if(dwt->parity == 0) |
907 | 0 | { /* Left-most sample is on even coordinate */ |
908 | 0 | if(total_width > 1) |
909 | 0 | { |
910 | 0 | decompress_h_parity_even_53(dwt->mem, bandL, dwt->sn_full, bandH, dwt->dn_full, dest); |
911 | 0 | } |
912 | 0 | else |
913 | 0 | { |
914 | 0 | assert(dwt->sn_full == 1); |
915 | | // only L op: only one sample in L band and H band is empty |
916 | 0 | dest[0] = bandL[0]; |
917 | 0 | } |
918 | 0 | } |
919 | 0 | else |
920 | 0 | { /* Left-most sample is on odd coordinate */ |
921 | 0 | if(total_width == 1) |
922 | 0 | { |
923 | 0 | assert(dwt->dn_full == 1); |
924 | | // only H op: only one sample in H band and L band is empty |
925 | 0 | dest[0] = bandH[0] >> 1; |
926 | 0 | } |
927 | 0 | else if(total_width == 2) |
928 | 0 | { |
929 | 0 | dwt->mem[1] = bandL[0] - ((bandH[0] + 1) >> 1); |
930 | 0 | dest[0] = bandH[0] + dwt->mem[1]; |
931 | 0 | dest[1] = dwt->mem[1]; |
932 | 0 | } |
933 | 0 | else |
934 | 0 | { |
935 | 0 | decompress_h_parity_odd_53(dwt->mem, bandL, dwt->sn_full, bandH, dwt->dn_full, dest); |
936 | 0 | } |
937 | 0 | } |
938 | 0 | } |
939 | | |
940 | | /* <summary> */ |
941 | | /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */ |
942 | | /* </summary> */ |
943 | | /* Performs interleave, inverse wavelet transform and copy back to buffer */ |
944 | | /** Number of columns that we can process in parallel in the vertical pass */ |
945 | 0 | #define PLL_COLS_53 (2 * uint32_t(HWY_DYNAMIC_DISPATCH(hwy_num_lanes)())) |
946 | | void WaveletReverse::decompress_v_53(const dwt_data<int32_t>* dwt, grk_buf2d_simple<int32_t> winL, |
947 | | grk_buf2d_simple<int32_t> winH, |
948 | | grk_buf2d_simple<int32_t> winDest, uint32_t nb_cols) |
949 | 0 | { |
950 | 0 | const uint32_t total_height = dwt->sn_full + dwt->dn_full; |
951 | 0 | assert(total_height != 0); |
952 | 0 | if(dwt->parity == 0) |
953 | 0 | { |
954 | 0 | if(total_height == 1) |
955 | 0 | { |
956 | 0 | for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winDest.buf_++) |
957 | 0 | winDest.buf_[0] = winL.buf_[0]; |
958 | 0 | } |
959 | 0 | else |
960 | 0 | { |
961 | 0 | if(nb_cols == PLL_COLS_53) |
962 | 0 | { |
963 | | /* Same as below general case, except that thanks to SSE2/AVX2 */ |
964 | | /* we can efficiently process 8/16 columns in parallel */ |
965 | 0 | HWY_DYNAMIC_DISPATCH(hwy_decompress_v_parity_even_mcols_53) |
966 | 0 | (dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, winH.buf_, dwt->dn_full, winH.stride_, |
967 | 0 | winDest.buf_, winDest.stride_); |
968 | 0 | } |
969 | 0 | else |
970 | 0 | { |
971 | 0 | for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++) |
972 | 0 | decompress_v_parity_even_53(dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, |
973 | 0 | winH.buf_, dwt->dn_full, winL.stride_, winDest.buf_, |
974 | 0 | winDest.stride_); |
975 | 0 | } |
976 | 0 | } |
977 | 0 | } |
978 | 0 | else |
979 | 0 | { |
980 | 0 | if(total_height == 1) |
981 | 0 | { |
982 | 0 | for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winDest.buf_++) |
983 | 0 | winDest.buf_[0] = winL.buf_[0] >> 1; |
984 | 0 | } |
985 | 0 | else if(total_height == 2) |
986 | 0 | { |
987 | 0 | auto out = dwt->mem; |
988 | 0 | for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++) |
989 | 0 | { |
990 | 0 | out[1] = winL.buf_[0] - ((winH.buf_[0] + 1) >> 1); |
991 | 0 | winDest.buf_[0] = winH.buf_[0] + out[1]; |
992 | 0 | winDest.buf_[1] = out[1]; |
993 | 0 | } |
994 | 0 | } |
995 | 0 | else |
996 | 0 | { |
997 | 0 | if(nb_cols == PLL_COLS_53) |
998 | 0 | { |
999 | | /* Same as below general case, except that thanks to SSE2/AVX2 */ |
1000 | | /* we can efficiently process 8/16 columns in parallel */ |
1001 | 0 | HWY_DYNAMIC_DISPATCH(hwy_decompress_v_parity_odd_mcols_53) |
1002 | 0 | (dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, winH.buf_, dwt->dn_full, winH.stride_, |
1003 | 0 | winDest.buf_, winDest.stride_); |
1004 | 0 | } |
1005 | 0 | else |
1006 | 0 | { |
1007 | 0 | for(uint32_t c = 0; c < nb_cols; c++, winL.buf_++, winH.buf_++, winDest.buf_++) |
1008 | 0 | decompress_v_parity_odd_53(dwt->mem, winL.buf_, dwt->sn_full, winL.stride_, |
1009 | 0 | winH.buf_, dwt->dn_full, winH.stride_, winDest.buf_, |
1010 | 0 | winDest.stride_); |
1011 | 0 | } |
1012 | 0 | } |
1013 | 0 | } |
1014 | 0 | } |
1015 | | |
1016 | | void WaveletReverse::decompress_h_strip_53(const dwt_data<int32_t>* horiz, uint32_t hMin, |
1017 | | uint32_t hMax, grk_buf2d_simple<int32_t> winL, |
1018 | | grk_buf2d_simple<int32_t> winH, |
1019 | | grk_buf2d_simple<int32_t> winDest) |
1020 | 0 | { |
1021 | 0 | for(uint32_t j = hMin; j < hMax; ++j) |
1022 | 0 | { |
1023 | 0 | decompress_h_53(horiz, winL.buf_, winH.buf_, winDest.buf_); |
1024 | 0 | winL.incY_IN_PLACE(1); |
1025 | 0 | winH.incY_IN_PLACE(1); |
1026 | 0 | winDest.incY_IN_PLACE(1); |
1027 | 0 | } |
1028 | 0 | } |
1029 | | bool WaveletReverse::decompress_h_53(uint8_t res, TileComponentWindow<int32_t>* buf, |
1030 | | uint32_t resHeight, size_t dataLength) |
1031 | 0 | { |
1032 | 0 | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); |
1033 | 0 | grk_buf2d_simple<int32_t> winL, winH, winDest; |
1034 | 0 | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); |
1035 | 0 | auto resFlow = imageComponentFlow->getResFlow(res - 1); |
1036 | 0 | uint32_t numTasks[2] = {0, 0}; |
1037 | 0 | uint32_t height[2] = {0, 0}; |
1038 | 0 | for(uint32_t orient = 0; orient < 2; ++orient) |
1039 | 0 | { |
1040 | 0 | height[orient] = (orient == 0) ? vert_.sn_full : resHeight - vert_.sn_full; |
1041 | 0 | if(numThreads > 1) |
1042 | 0 | numTasks[orient] = height[orient] < numThreads ? height[orient] : numThreads; |
1043 | 0 | } |
1044 | 0 | for(uint32_t orient = 0; orient < 2; ++orient) |
1045 | 0 | { |
1046 | 0 | if(height[orient] == 0) |
1047 | 0 | continue; |
1048 | 0 | if(orient == 0) |
1049 | 0 | { |
1050 | 0 | winL = buf->getResWindowBufferSimple(res - 1U); |
1051 | 0 | winH = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_HL); |
1052 | 0 | winDest = buf->getResWindowBufferSplitSimple(res, SPLIT_L); |
1053 | 0 | } |
1054 | 0 | else |
1055 | 0 | { |
1056 | 0 | winL = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_LH); |
1057 | 0 | winH = buf->getBandWindowBufferPaddedSimple(res, BAND_ORIENT_HH); |
1058 | 0 | winDest = buf->getResWindowBufferSplitSimple(res, SPLIT_H); |
1059 | 0 | } |
1060 | 0 | if(numThreads == 1) |
1061 | 0 | { |
1062 | 0 | if(!horiz_.mem) |
1063 | 0 | { |
1064 | 0 | if(!horiz_.alloc(dataLength)) |
1065 | 0 | { |
1066 | 0 | Logger::logger_.error("Out of memory"); |
1067 | 0 | return false; |
1068 | 0 | } |
1069 | 0 | vert_.mem = horiz_.mem; |
1070 | 0 | } |
1071 | 0 | decompress_h_strip_53(&horiz_, 0, height[orient], winL, winH, winDest); |
1072 | 0 | } |
1073 | 0 | else |
1074 | 0 | { |
1075 | 0 | uint32_t incrPerJob = height[orient] / numTasks[orient]; |
1076 | 0 | for(uint32_t j = 0; j < numTasks[orient]; ++j) |
1077 | 0 | { |
1078 | 0 | auto indexMin = j * incrPerJob; |
1079 | 0 | auto indexMax = j < (numTasks[orient] - 1U) ? (j + 1U) * incrPerJob : height[orient]; |
1080 | 0 | auto horiz = new dwt_data<int32_t>(horiz_); |
1081 | 0 | if(!horiz->alloc(dataLength)) |
1082 | 0 | { |
1083 | 0 | Logger::logger_.error("Out of memory"); |
1084 | 0 | delete horiz; |
1085 | 0 | return false; |
1086 | 0 | } |
1087 | 0 | resFlow->waveletHoriz_->nextTask().work( |
1088 | 0 | [this, horiz, winL, winH, winDest, indexMin, indexMax] { |
1089 | 0 | decompress_h_strip_53(horiz, indexMin, indexMax, winL, winH, winDest); |
1090 | 0 | delete horiz; |
1091 | 0 | }); |
1092 | 0 | winL.incY_IN_PLACE(incrPerJob); |
1093 | 0 | winH.incY_IN_PLACE(incrPerJob); |
1094 | 0 | winDest.incY_IN_PLACE(incrPerJob); |
1095 | 0 | } |
1096 | 0 | } |
1097 | 0 | } |
1098 | | |
1099 | 0 | return true; |
1100 | 0 | } |
1101 | | |
1102 | | void WaveletReverse::decompress_v_strip_53(const dwt_data<int32_t>* vert, uint32_t wMin, |
1103 | | uint32_t wMax, grk_buf2d_simple<int32_t> winL, |
1104 | | grk_buf2d_simple<int32_t> winH, |
1105 | | grk_buf2d_simple<int32_t> winDest) |
1106 | 0 | { |
1107 | 0 | uint32_t j; |
1108 | 0 | for(j = wMin; j + PLL_COLS_53 <= wMax; j += PLL_COLS_53) |
1109 | 0 | { |
1110 | 0 | decompress_v_53(vert, winL, winH, winDest, PLL_COLS_53); |
1111 | 0 | winL.incX_IN_PLACE(PLL_COLS_53); |
1112 | 0 | winH.incX_IN_PLACE(PLL_COLS_53); |
1113 | 0 | winDest.incX_IN_PLACE(PLL_COLS_53); |
1114 | 0 | } |
1115 | 0 | if(j < wMax) |
1116 | 0 | decompress_v_53(vert, winL, winH, winDest, wMax - j); |
1117 | 0 | } |
1118 | | |
1119 | | bool WaveletReverse::decompress_v_53(uint8_t res, TileComponentWindow<int32_t>* buf, |
1120 | | uint32_t resWidth, size_t dataLength) |
1121 | 0 | { |
1122 | 0 | if(resWidth == 0) |
1123 | 0 | return true; |
1124 | 0 | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); |
1125 | 0 | auto winL = buf->getResWindowBufferSplitSimple(res, SPLIT_L); |
1126 | 0 | auto winH = buf->getResWindowBufferSplitSimple(res, SPLIT_H); |
1127 | 0 | auto winDest = buf->getResWindowBufferSimple(res); |
1128 | 0 | if(numThreads == 1) |
1129 | 0 | { |
1130 | 0 | if(!horiz_.mem) |
1131 | 0 | { |
1132 | 0 | if(!horiz_.alloc(dataLength)) |
1133 | 0 | { |
1134 | 0 | Logger::logger_.error("Out of memory"); |
1135 | 0 | return false; |
1136 | 0 | } |
1137 | 0 | vert_.mem = horiz_.mem; |
1138 | 0 | } |
1139 | 0 | decompress_v_strip_53(&vert_, 0, resWidth, winL, winH, winDest); |
1140 | 0 | } |
1141 | 0 | else |
1142 | 0 | { |
1143 | 0 | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); |
1144 | 0 | auto resFlow = imageComponentFlow->getResFlow(res - 1); |
1145 | 0 | const uint32_t numTasks = resWidth < numThreads ? resWidth : numThreads; |
1146 | 0 | uint32_t step = resWidth / numTasks; |
1147 | 0 | for(uint32_t j = 0; j < numTasks; j++) |
1148 | 0 | { |
1149 | 0 | auto indexMin = j * step; |
1150 | 0 | auto indexMax = j < (numTasks - 1U) ? (j + 1U) * step : resWidth; |
1151 | 0 | auto vert = new dwt_data<int32_t>(vert_); |
1152 | 0 | if(!vert->alloc(dataLength)) |
1153 | 0 | { |
1154 | 0 | Logger::logger_.error("Out of memory"); |
1155 | 0 | delete vert; |
1156 | 0 | return false; |
1157 | 0 | } |
1158 | 0 | resFlow->waveletVert_->nextTask().work( |
1159 | 0 | [this, vert, indexMin, indexMax, winL, winH, winDest] { |
1160 | 0 | decompress_v_strip_53(vert, indexMin, indexMax, winL, winH, winDest); |
1161 | 0 | delete vert; |
1162 | 0 | }); |
1163 | 0 | winL.incX_IN_PLACE(step); |
1164 | 0 | winH.incX_IN_PLACE(step); |
1165 | 0 | winDest.incX_IN_PLACE(step); |
1166 | 0 | } |
1167 | 0 | } |
1168 | 0 | return true; |
1169 | 0 | } |
1170 | | /* <summary> */ |
1171 | | /* Inverse wavelet transform in 2-D. */ |
1172 | | /* </summary> */ |
1173 | | bool WaveletReverse::decompress_tile_53(void) |
1174 | 0 | { |
1175 | 0 | if(numres_ == 1U) |
1176 | 0 | return true; |
1177 | | |
1178 | 0 | auto tileCompRes = tilec_->resolutions_; |
1179 | 0 | auto buf = tilec_->getWindow(); |
1180 | 0 | size_t dataLength = max_resolution(tileCompRes, numres_); |
1181 | | /* overflow check */ |
1182 | 0 | if(dataLength > (SIZE_MAX / PLL_COLS_53 / sizeof(int32_t))) |
1183 | 0 | { |
1184 | 0 | Logger::logger_.error("Overflow"); |
1185 | 0 | return false; |
1186 | 0 | } |
1187 | | /* We need PLL_COLS_53 times the height of the array, */ |
1188 | | /* since for the vertical pass */ |
1189 | | /* we process PLL_COLS_53 columns at a time */ |
1190 | 0 | dataLength *= PLL_COLS_53 * sizeof(int32_t); |
1191 | 0 | for(uint8_t res = 1; res < numres_; ++res) |
1192 | 0 | { |
1193 | 0 | horiz_.sn_full = tileCompRes->width(); |
1194 | 0 | vert_.sn_full = tileCompRes->height(); |
1195 | 0 | ++tileCompRes; |
1196 | 0 | auto resWidth = tileCompRes->width(); |
1197 | 0 | auto resHeight = tileCompRes->height(); |
1198 | 0 | if(resWidth == 0 || resHeight == 0) |
1199 | 0 | continue; |
1200 | 0 | horiz_.dn_full = resWidth - horiz_.sn_full; |
1201 | 0 | horiz_.parity = tileCompRes->x0 & 1; |
1202 | 0 | vert_.dn_full = resHeight - vert_.sn_full; |
1203 | 0 | vert_.parity = tileCompRes->y0 & 1; |
1204 | 0 | if(!decompress_h_53(res, buf, resHeight, dataLength)) |
1205 | 0 | return false; |
1206 | 0 | if(!decompress_v_53(res, buf, resWidth, dataLength)) |
1207 | 0 | return false; |
1208 | 0 | } |
1209 | | |
1210 | 0 | return true; |
1211 | 0 | } |
1212 | | |
1213 | | /************************************************************************************* |
1214 | | * |
1215 | | * Partial 5/3 or 9/7 Inverse Wavelet |
1216 | | * |
1217 | | ************************************************************************************** |
1218 | | * |
1219 | | * |
1220 | | * 5/3 operates on elements of type int32_t while 9/7 operates on elements of type vec4f |
1221 | | * |
1222 | | * Horizontal pass |
1223 | | * |
1224 | | * Each thread processes a strip running the length of the window, with height |
1225 | | * 5/3 |
1226 | | * Height : sizeof(T)/sizeof(int32_t) |
1227 | | * |
1228 | | * 9/7 |
1229 | | * Height : sizeof(T)/sizeof(int32_t) |
1230 | | * |
1231 | | * Vertical pass |
1232 | | * |
1233 | | * Each thread processes a strip running the height of the window, with width |
1234 | | * |
1235 | | * 5/3 |
1236 | | * Width : 4 |
1237 | | * |
1238 | | * 9/7 |
1239 | | * Width : 4 |
1240 | | * |
1241 | | ****************************************************************************/ |
1242 | | template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH> |
1243 | | class PartialInterleaver |
1244 | | { |
1245 | | public: |
1246 | | bool interleave_h(dwt_data<T>* dwt, ISparseCanvas* sa, uint32_t y_offset, uint32_t height) |
1247 | 515k | { |
1248 | 515k | const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t)); |
1249 | 1.07M | for(uint32_t y = 0; y < height; y++) |
1250 | 583k | { |
1251 | | // read one row of L band |
1252 | 583k | if(dwt->sn_full) |
1253 | 576k | { |
1254 | 576k | bool ret = |
1255 | 576k | sa->read(dwt->resno, |
1256 | 576k | grk_rect32(dwt->win_l.x0, y_offset + y, |
1257 | 576k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full), |
1258 | 576k | y_offset + y + 1), |
1259 | 576k | (int32_t*)dwt->memL + y, 2 * stripHeight, 0); |
1260 | 576k | if(!ret) |
1261 | 5.60k | return false; |
1262 | 576k | } |
1263 | | // read one row of H band |
1264 | 577k | if(dwt->dn_full) |
1265 | 589k | { |
1266 | 589k | bool ret = |
1267 | 589k | sa->read(dwt->resno, |
1268 | 589k | grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y, |
1269 | 589k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, |
1270 | 589k | dwt->dn_full), |
1271 | 589k | y_offset + y + 1), |
1272 | 589k | (int32_t*)dwt->memH + y, 2 * stripHeight, 0); |
1273 | 589k | if(!ret) |
1274 | 14.2k | return false; |
1275 | 589k | } |
1276 | 577k | } |
1277 | | |
1278 | 495k | return true; |
1279 | 515k | } grk::PartialInterleaver<int, 2u, 4u>::interleave_h(grk::dwt_data<int>*, grk::ISparseCanvas*, unsigned int, unsigned int) Line | Count | Source | 1247 | 84.3k | { | 1248 | 84.3k | const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t)); | 1249 | 168k | for(uint32_t y = 0; y < height; y++) | 1250 | 84.3k | { | 1251 | | // read one row of L band | 1252 | 84.3k | if(dwt->sn_full) | 1253 | 84.3k | { | 1254 | 84.3k | bool ret = | 1255 | 84.3k | sa->read(dwt->resno, | 1256 | 84.3k | grk_rect32(dwt->win_l.x0, y_offset + y, | 1257 | 84.3k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full), | 1258 | 84.3k | y_offset + y + 1), | 1259 | 84.3k | (int32_t*)dwt->memL + y, 2 * stripHeight, 0); | 1260 | 84.3k | if(!ret) | 1261 | 0 | return false; | 1262 | 84.3k | } | 1263 | | // read one row of H band | 1264 | 84.3k | if(dwt->dn_full) | 1265 | 84.2k | { | 1266 | 84.2k | bool ret = | 1267 | 84.2k | sa->read(dwt->resno, | 1268 | 84.2k | grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y, | 1269 | 84.2k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, | 1270 | 84.2k | dwt->dn_full), | 1271 | 84.2k | y_offset + y + 1), | 1272 | 84.2k | (int32_t*)dwt->memH + y, 2 * stripHeight, 0); | 1273 | 84.2k | if(!ret) | 1274 | 0 | return false; | 1275 | 84.2k | } | 1276 | 84.3k | } | 1277 | | | 1278 | 84.3k | return true; | 1279 | 84.3k | } |
grk::PartialInterleaver<grk::vec<float, 4ul>, 2u, 1u>::interleave_h(grk::dwt_data<grk::vec<float, 4ul> >*, grk::ISparseCanvas*, unsigned int, unsigned int) Line | Count | Source | 1247 | 430k | { | 1248 | 430k | const uint32_t stripHeight = (uint32_t)(sizeof(T) / sizeof(int32_t)); | 1249 | 910k | for(uint32_t y = 0; y < height; y++) | 1250 | 499k | { | 1251 | | // read one row of L band | 1252 | 499k | if(dwt->sn_full) | 1253 | 492k | { | 1254 | 492k | bool ret = | 1255 | 492k | sa->read(dwt->resno, | 1256 | 492k | grk_rect32(dwt->win_l.x0, y_offset + y, | 1257 | 492k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full), | 1258 | 492k | y_offset + y + 1), | 1259 | 492k | (int32_t*)dwt->memL + y, 2 * stripHeight, 0); | 1260 | 492k | if(!ret) | 1261 | 5.60k | return false; | 1262 | 492k | } | 1263 | | // read one row of H band | 1264 | 493k | if(dwt->dn_full) | 1265 | 505k | { | 1266 | 505k | bool ret = | 1267 | 505k | sa->read(dwt->resno, | 1268 | 505k | grk_rect32(dwt->sn_full + dwt->win_h.x0, y_offset + y, | 1269 | 505k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, | 1270 | 505k | dwt->dn_full), | 1271 | 505k | y_offset + y + 1), | 1272 | 505k | (int32_t*)dwt->memH + y, 2 * stripHeight, 0); | 1273 | 505k | if(!ret) | 1274 | 14.2k | return false; | 1275 | 505k | } | 1276 | 493k | } | 1277 | | | 1278 | 411k | return true; | 1279 | 430k | } |
|
1280 | | bool interleave_v(dwt_data<T>* GRK_RESTRICT dwt, ISparseCanvas* sa, uint32_t x_offset, |
1281 | | uint32_t xWidth) |
1282 | 489k | { |
1283 | 489k | const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH; |
1284 | | // read one vertical strip (of width xWidth <= stripWidth) of L band |
1285 | 489k | bool ret = false; |
1286 | 489k | if(dwt->sn_full) |
1287 | 486k | { |
1288 | 486k | ret = sa->read(dwt->resno, |
1289 | 486k | grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth, |
1290 | 486k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)), |
1291 | 486k | (int32_t*)dwt->memL, 1, 2 * stripWidth); |
1292 | 486k | } |
1293 | | // read one vertical strip (of width x_num_elements <= stripWidth) of H band |
1294 | 489k | if(dwt->dn_full) |
1295 | 502k | { |
1296 | 502k | ret = sa->read(dwt->resno, |
1297 | 502k | grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth, |
1298 | 502k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, |
1299 | 502k | dwt->dn_full)), |
1300 | 502k | (int32_t*)dwt->memH, 1, 2 * stripWidth); |
1301 | 502k | } |
1302 | | |
1303 | 489k | return ret; |
1304 | 489k | } grk::PartialInterleaver<int, 2u, 4u>::interleave_v(grk::dwt_data<int>*, grk::ISparseCanvas*, unsigned int, unsigned int) Line | Count | Source | 1282 | 29.9k | { | 1283 | 29.9k | const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH; | 1284 | | // read one vertical strip (of width xWidth <= stripWidth) of L band | 1285 | 29.9k | bool ret = false; | 1286 | 29.9k | if(dwt->sn_full) | 1287 | 30.0k | { | 1288 | 30.0k | ret = sa->read(dwt->resno, | 1289 | 30.0k | grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth, | 1290 | 30.0k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)), | 1291 | 30.0k | (int32_t*)dwt->memL, 1, 2 * stripWidth); | 1292 | 30.0k | } | 1293 | | // read one vertical strip (of width x_num_elements <= stripWidth) of H band | 1294 | 29.9k | if(dwt->dn_full) | 1295 | 30.2k | { | 1296 | 30.2k | ret = sa->read(dwt->resno, | 1297 | 30.2k | grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth, | 1298 | 30.2k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, | 1299 | 30.2k | dwt->dn_full)), | 1300 | 30.2k | (int32_t*)dwt->memH, 1, 2 * stripWidth); | 1301 | 30.2k | } | 1302 | | | 1303 | 29.9k | return ret; | 1304 | 29.9k | } |
grk::PartialInterleaver<grk::vec<float, 4ul>, 2u, 1u>::interleave_v(grk::dwt_data<grk::vec<float, 4ul> >*, grk::ISparseCanvas*, unsigned int, unsigned int) Line | Count | Source | 1282 | 459k | { | 1283 | 459k | const uint32_t stripWidth = (sizeof(T) / sizeof(int32_t)) * VERT_PASS_WIDTH; | 1284 | | // read one vertical strip (of width xWidth <= stripWidth) of L band | 1285 | 459k | bool ret = false; | 1286 | 459k | if(dwt->sn_full) | 1287 | 456k | { | 1288 | 456k | ret = sa->read(dwt->resno, | 1289 | 456k | grk_rect32(x_offset, dwt->win_l.x0, x_offset + xWidth, | 1290 | 456k | std::min<uint32_t>(dwt->win_l.x1 + FILTER_WIDTH, dwt->sn_full)), | 1291 | 456k | (int32_t*)dwt->memL, 1, 2 * stripWidth); | 1292 | 456k | } | 1293 | | // read one vertical strip (of width x_num_elements <= stripWidth) of H band | 1294 | 459k | if(dwt->dn_full) | 1295 | 472k | { | 1296 | 472k | ret = sa->read(dwt->resno, | 1297 | 472k | grk_rect32(x_offset, dwt->sn_full + dwt->win_h.x0, x_offset + xWidth, | 1298 | 472k | dwt->sn_full + std::min<uint32_t>(dwt->win_h.x1 + FILTER_WIDTH, | 1299 | 472k | dwt->dn_full)), | 1300 | 472k | (int32_t*)dwt->memH, 1, 2 * stripWidth); | 1301 | 472k | } | 1302 | | | 1303 | 459k | return ret; | 1304 | 459k | } |
|
1305 | | }; |
1306 | | template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH> |
1307 | | class Partial53 : public PartialInterleaver<T, FILTER_WIDTH, VERT_PASS_WIDTH> |
1308 | | { |
1309 | | public: |
1310 | | void decompress_h(dwt_data<T>* dwt) |
1311 | 84.4k | { |
1312 | 84.4k | #ifndef GRK_DEBUG_SPARSE |
1313 | 407k | #define get_S(buf, i) buf[(i) << 1] |
1314 | 29.8M | #define get_D(buf, i) buf[(1 + ((i) << 1))] |
1315 | 84.4k | #endif |
1316 | | |
1317 | 44.6M | #define S(buf, i) buf[(i) << 1] |
1318 | 15.0M | #define D(buf, i) buf[(1 + ((i) << 1))] |
1319 | | |
1320 | | // parity == 0 |
1321 | 84.4k | #define S_(buf, i) \ |
1322 | 100k | ((i) < -win_l_x0 ? get_S(buf, -win_l_x0) : ((i) >= sn ? get_S(buf, sn - 1) : get_S(buf, i))) |
1323 | 84.4k | #define D_(buf, i) \ |
1324 | 174k | ((i) < -win_h_x0 ? get_D(buf, -win_h_x0) : ((i) >= dn ? get_D(buf, dn - 1) : get_D(buf, i))) |
1325 | | |
1326 | | // parity == 1 |
1327 | 84.4k | #define SS_(buf, i) \ |
1328 | 84.4k | ((i) < -win_h_x0 ? get_S(buf, -win_h_x0) : ((i) >= dn ? get_S(buf, dn - 1) : get_S(buf, i))) |
1329 | 84.4k | #define DD_(buf, i) \ |
1330 | 84.4k | ((i) < -win_l_x0 ? get_D(buf, -win_l_x0) : ((i) >= sn ? get_D(buf, sn - 1) : get_D(buf, i))) |
1331 | | |
1332 | 84.4k | int64_t i; |
1333 | 84.4k | int64_t parity = dwt->parity; |
1334 | 84.4k | int64_t win_l_x0 = dwt->win_l.x0; |
1335 | 84.4k | int64_t win_l_x1 = dwt->win_l.x1; |
1336 | 84.4k | int64_t win_h_x0 = dwt->win_h.x0; |
1337 | 84.4k | int64_t win_h_x1 = dwt->win_h.x1; |
1338 | 84.4k | assert(dwt->win_l.x0 <= dwt->sn_full); |
1339 | 84.4k | int64_t sn = (int64_t)dwt->sn_full - (int64_t)dwt->win_l.x0; |
1340 | 84.4k | int64_t sn_full = dwt->sn_full; |
1341 | 84.4k | assert(dwt->win_h.x0 <= dwt->dn_full); |
1342 | 84.4k | int64_t dn = (int64_t)dwt->dn_full - (int64_t)dwt->win_h.x0; |
1343 | 84.4k | int64_t dn_full = dwt->dn_full; |
1344 | | |
1345 | 84.4k | adjust_bounds(dwt, sn_full, dn_full, &sn, &dn); |
1346 | | |
1347 | 84.4k | assert(dwt->win_l.x1 <= sn_full && dwt->win_h.x1 <= dn_full); |
1348 | | |
1349 | 84.4k | auto buf = dwt->mem; |
1350 | 84.4k | if(!parity) |
1351 | 84.1k | { |
1352 | 84.1k | if((dn_full != 0) || (sn_full > 1)) |
1353 | 84.1k | { |
1354 | | /* Naive version is : |
1355 | | for (i = win_l_x0; i < i_max; i++) { |
1356 | | S(i) -= (D_(i - 1) + D_(i) + 2) >> 2; |
1357 | | } |
1358 | | for (i = win_h_x0; i < win_h_x1; i++) { |
1359 | | D(i) += (S_(i) + S_(i + 1)) >> 1; |
1360 | | } |
1361 | | but the compiler doesn't manage to unroll it to avoid bound |
1362 | | checking in S_ and D_ macros |
1363 | | */ |
1364 | 84.1k | i = 0; |
1365 | 84.1k | int64_t i_max = win_l_x1 - win_l_x0; |
1366 | 84.1k | if(i < i_max) |
1367 | 84.1k | { |
1368 | | /* Left-most case */ |
1369 | 84.1k | S(buf, i) -= (D_(buf, i - 1) + D_(buf, i) + 2) >> 2; |
1370 | 84.1k | i++; |
1371 | | |
1372 | 84.1k | if(i_max > dn) |
1373 | 3.15k | i_max = dn; |
1374 | 14.7M | for(; i < i_max; i++) |
1375 | | /* No bound checking */ |
1376 | 14.6M | S(buf, i) -= (get_D(buf, i - 1) + get_D(buf, i) + 2) >> 2; |
1377 | 87.3k | for(; i < win_l_x1 - win_l_x0; i++) |
1378 | | /* Right-most case */ |
1379 | 3.16k | S(buf, i) -= (D_(buf, i - 1) + D_(buf, i) + 2) >> 2; |
1380 | 84.1k | } |
1381 | 84.1k | i = 0; |
1382 | 84.1k | i_max = win_h_x1 - win_h_x0; |
1383 | 84.1k | if(i < i_max) |
1384 | 85.5k | { |
1385 | 85.5k | if(i_max >= sn) |
1386 | 44.6k | i_max = sn - 1; |
1387 | 15.0M | for(; i < i_max; i++) |
1388 | | /* No bound checking */ |
1389 | 14.9M | D(buf, i) += (S(buf, i) + S(buf, i + 1)) >> 1; |
1390 | 135k | for(; i < win_h_x1 - win_h_x0; i++) |
1391 | | /* Right-most case */ |
1392 | 50.1k | D(buf, i) += (S_(buf, i) + S_(buf, i + 1)) >> 1; |
1393 | 85.5k | } |
1394 | 84.1k | } |
1395 | 84.1k | } |
1396 | 273 | else |
1397 | 273 | { |
1398 | 273 | if(sn_full == 0 && dn_full == 1) |
1399 | 15 | { |
1400 | | // only do L band (high pass) |
1401 | 15 | S(buf, 0) >>= 1; |
1402 | 15 | } |
1403 | 258 | else |
1404 | 258 | { |
1405 | 1.05k | for(i = 0; i < win_l_x1 - win_l_x0; i++) |
1406 | 795 | D(buf, i) -= (SS_(buf, i) + SS_(buf, i + 1) + 2) >> 2; |
1407 | 1.19k | for(i = 0; i < win_h_x1 - win_h_x0; i++) |
1408 | 933 | S(buf, i) += (DD_(buf, i) + DD_(buf, i - 1)) >> 1; |
1409 | 258 | } |
1410 | 273 | } |
1411 | 84.4k | } |
1412 | | void decompress_v(dwt_data<T>* dwt) |
1413 | 30.2k | { |
1414 | 30.2k | #ifndef GRK_DEBUG_SPARSE |
1415 | 542k | #define get_S_off(buf, i, off) buf[((i) << 1) * VERT_PASS_WIDTH + off] |
1416 | 989k | #define get_D_off(buf, i, off) buf[(1 + ((i) << 1)) * VERT_PASS_WIDTH + off] |
1417 | 30.2k | #endif |
1418 | | |
1419 | 323k | #define S_off(buf, i, off) buf[((i) << 1) * VERT_PASS_WIDTH + off] |
1420 | 224k | #define D_off(buf, i, off) buf[(1 + ((i) << 1)) * VERT_PASS_WIDTH + off] |
1421 | | |
1422 | | // parity == 0 |
1423 | 174k | #define S_off_(buf, i, off) (((i) >= sn ? get_S_off(buf, sn - 1, off) : get_S_off(buf, i, off))) |
1424 | 227k | #define D_off_(buf, i, off) (((i) >= dn ? get_D_off(buf, dn - 1, off) : get_D_off(buf, i, off))) |
1425 | | |
1426 | 30.2k | #define S_sgnd_off_(buf, i, off) \ |
1427 | 30.2k | (((i) < (-win_l_x0) ? get_S_off(buf, -win_l_x0, off) : S_off_(buf, i, off))) |
1428 | 30.2k | #define D_sgnd_off_(buf, i, off) \ |
1429 | 193k | (((i) < (-win_h_x0) ? get_D_off(buf, -win_h_x0, off) : D_off_(buf, i, off))) |
1430 | | |
1431 | | // case == 1 |
1432 | 30.2k | #define SS_sgnd_off_(buf, i, off) \ |
1433 | 30.2k | ((i) < (-win_h_x0) ? get_S_off(buf, -win_h_x0, off) \ |
1434 | 30.2k | : ((i) >= dn ? get_S_off(buf, dn - 1, off) : get_S_off(buf, i, off))) |
1435 | 30.2k | #define DD_sgnd_off_(buf, i, off) \ |
1436 | 30.2k | ((i) < (-win_l_x0) ? get_D_off(buf, -win_l_x0, off) \ |
1437 | 3.78k | : ((i) >= sn ? get_D_off(buf, sn - 1, off) : get_D_off(buf, i, off))) |
1438 | | |
1439 | 30.2k | #define SS_off_(buf, i, off) (((i) >= dn ? get_S_off(buf, dn - 1, off) : get_S_off(buf, i, off))) |
1440 | 30.2k | #define DD_off_(buf, i, off) (((i) >= sn ? get_D_off(buf, sn - 1, off) : get_D_off(buf, i, off))) |
1441 | | |
1442 | 30.2k | int64_t i; |
1443 | 30.2k | int64_t parity = dwt->parity; |
1444 | 30.2k | int64_t win_l_x0 = dwt->win_l.x0; |
1445 | 30.2k | int64_t win_l_x1 = dwt->win_l.x1; |
1446 | 30.2k | int64_t win_h_x0 = dwt->win_h.x0; |
1447 | 30.2k | int64_t win_h_x1 = dwt->win_h.x1; |
1448 | 30.2k | int64_t sn = (int64_t)dwt->sn_full - (int64_t)dwt->win_l.x0; |
1449 | 30.2k | int64_t sn_full = dwt->sn_full; |
1450 | 30.2k | int64_t dn = (int64_t)dwt->dn_full - (int64_t)dwt->win_h.x0; |
1451 | 30.2k | int64_t dn_full = dwt->dn_full; |
1452 | | |
1453 | 30.2k | adjust_bounds(dwt, sn_full, dn_full, &sn, &dn); |
1454 | | |
1455 | 30.2k | assert(dwt->win_l.x1 <= sn_full && dwt->win_h.x1 <= dn_full); |
1456 | | |
1457 | 30.2k | auto buf = dwt->mem; |
1458 | 30.2k | if(!parity) |
1459 | 29.9k | { |
1460 | 29.9k | if((dn_full != 0) || (sn_full > 1)) |
1461 | 29.9k | { |
1462 | | /* Naive version is : |
1463 | | for (i = win_l_x0; i < i_max; i++) { |
1464 | | S(i) -= (D_(i - 1) + D_(i) + 2) >> 2; |
1465 | | } |
1466 | | for (i = win_h_x0; i < win_h_x1; i++) { |
1467 | | D(i) += (S_(i) + S_(i + 1)) >> 1; |
1468 | | } |
1469 | | but the compiler doesn't manage to unroll it to avoid bound |
1470 | | checking in S_ and D_ macros |
1471 | | */ |
1472 | | |
1473 | | // 1. low pass |
1474 | 29.9k | i = 0; |
1475 | 29.9k | int64_t i_max = win_l_x1 - win_l_x0; |
1476 | 29.9k | assert(win_l_x1 >= win_l_x0); |
1477 | 29.9k | if(i < i_max) |
1478 | 29.8k | { |
1479 | | /* Left-most case */ |
1480 | 149k | for(int64_t off = 0; off < VERT_PASS_WIDTH; off++) |
1481 | 119k | S_off(buf, i, off) -= |
1482 | 119k | (D_sgnd_off_(buf, i - 1, off) + D_off_(buf, i, off) + 2) >> 2; |
1483 | 29.8k | i++; |
1484 | 29.8k | if(i_max > dn) |
1485 | 964 | i_max = dn; |
1486 | 29.8k | #ifdef __SSE2__ |
1487 | 29.8k | if(i + 1 < i_max) |
1488 | 29.7k | { |
1489 | 29.7k | const __m128i two = _mm_set1_epi32(2); |
1490 | 29.7k | auto Dm1 = _mm_load_si128((__m128i*)(buf + ((i << 1) - 1) * VERT_PASS_WIDTH)); |
1491 | 2.03M | for(; i + 1 < i_max; i += 2) |
1492 | 2.00M | { |
1493 | | /* No bound checking */ |
1494 | 2.00M | auto S = _mm_load_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH)); |
1495 | 2.00M | auto D = _mm_load_si128((__m128i*)(buf + ((i << 1) + 1) * VERT_PASS_WIDTH)); |
1496 | 2.00M | auto S1 = _mm_load_si128((__m128i*)(buf + ((i << 1) + 2) * VERT_PASS_WIDTH)); |
1497 | 2.00M | auto D1 = _mm_load_si128((__m128i*)(buf + ((i << 1) + 3) * VERT_PASS_WIDTH)); |
1498 | 2.00M | S = _mm_sub_epi32( |
1499 | 2.00M | S, _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2)); |
1500 | 2.00M | S1 = _mm_sub_epi32( |
1501 | 2.00M | S1, _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2)); |
1502 | 2.00M | _mm_store_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH), S); |
1503 | 2.00M | _mm_store_si128((__m128i*)(buf + ((i + 1) << 1) * VERT_PASS_WIDTH), S1); |
1504 | 2.00M | Dm1 = D1; |
1505 | 2.00M | } |
1506 | 29.7k | } |
1507 | 29.8k | #endif |
1508 | 47.5k | for(; i < i_max; i++) |
1509 | 17.6k | { |
1510 | | /* No bound checking */ |
1511 | 88.1k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1512 | 70.5k | S_off(buf, i, off) -= |
1513 | 70.5k | (D_sgnd_off_(buf, i - 1, off) + D_off(buf, i, off) + 2) >> 2; |
1514 | 17.6k | } |
1515 | 30.8k | for(; i < win_l_x1 - win_l_x0; i++) |
1516 | 970 | { |
1517 | | /* Right-most case */ |
1518 | 4.83k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1519 | 3.86k | S_off(buf, i, off) -= |
1520 | 3.86k | (D_sgnd_off_(buf, i - 1, off) + D_off_(buf, i, off) + 2) >> 2; |
1521 | 970 | } |
1522 | 29.8k | } |
1523 | | |
1524 | | // 2. high pass |
1525 | 29.9k | i = 0; |
1526 | 29.9k | assert(win_h_x1 >= win_h_x0); |
1527 | 29.9k | i_max = win_h_x1 - win_h_x0; |
1528 | 29.9k | if(i < i_max) |
1529 | 30.0k | { |
1530 | 30.0k | if(i_max >= sn) |
1531 | 18.8k | i_max = sn - 1; |
1532 | 30.0k | #ifdef __SSE2__ |
1533 | 30.0k | if(i + 1 < i_max) |
1534 | 29.9k | { |
1535 | 29.9k | auto S = _mm_load_si128((__m128i*)(buf + (i << 1) * VERT_PASS_WIDTH)); |
1536 | 2.05M | for(; i + 1 < i_max; i += 2) |
1537 | 2.02M | { |
1538 | | /* No bound checking */ |
1539 | 2.02M | auto D = _mm_load_si128((__m128i*)(buf + (1 + (i << 1)) * VERT_PASS_WIDTH)); |
1540 | 2.02M | auto S1 = _mm_load_si128((__m128i*)(buf + ((i + 1) << 1) * VERT_PASS_WIDTH)); |
1541 | 2.02M | auto D1 = |
1542 | 2.02M | _mm_load_si128((__m128i*)(buf + (1 + ((i + 1) << 1)) * VERT_PASS_WIDTH)); |
1543 | 2.02M | auto S2 = _mm_load_si128((__m128i*)(buf + ((i + 2) << 1) * VERT_PASS_WIDTH)); |
1544 | 2.02M | D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1)); |
1545 | 2.02M | D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1)); |
1546 | 2.02M | _mm_store_si128((__m128i*)(buf + (1 + (i << 1)) * VERT_PASS_WIDTH), D); |
1547 | 2.02M | _mm_store_si128((__m128i*)(buf + (1 + ((i + 1) << 1)) * VERT_PASS_WIDTH), D1); |
1548 | 2.02M | S = S2; |
1549 | 2.02M | } |
1550 | 29.9k | } |
1551 | 30.0k | #endif |
1552 | 45.8k | for(; i < i_max; i++) |
1553 | 15.7k | { |
1554 | | /* No bound checking */ |
1555 | 78.8k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1556 | 63.0k | D_off(buf, i, off) += (S_off(buf, i, off) + S_off(buf, i + 1, off)) >> 1; |
1557 | 15.7k | } |
1558 | 51.8k | for(; i < win_h_x1 - win_h_x0; i++) |
1559 | 21.8k | { |
1560 | | /* Right-most case */ |
1561 | 109k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1562 | 87.2k | D_off(buf, i, off) += (S_off_(buf, i, off) + S_off_(buf, i + 1, off)) >> 1; |
1563 | 21.8k | } |
1564 | 30.0k | } |
1565 | 29.9k | } |
1566 | 29.9k | } |
1567 | 276 | else |
1568 | 276 | { |
1569 | 276 | if(sn_full == 0 && dn_full == 1) |
1570 | 6 | { |
1571 | | // edge case at origin |
1572 | 30 | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1573 | 24 | S_off(buf, 0, off) >>= 1; |
1574 | 6 | } |
1575 | 270 | else |
1576 | 270 | { |
1577 | 270 | assert((uint64_t)(dwt->memL + (win_l_x1 - win_l_x0) * VERT_PASS_WIDTH) - |
1578 | 270 | (uint64_t)dwt->allocatedMem < |
1579 | 270 | dwt->lenBytes_); |
1580 | 1.08k | for(i = 0; i < win_l_x1 - win_l_x0; i++) |
1581 | 815 | { |
1582 | 4.07k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1583 | 3.26k | D_off(buf, i, off) -= (SS_off_(buf, i, off) + SS_off_(buf, i + 1, off) + 2) >> 2; |
1584 | 815 | } |
1585 | 270 | assert((uint64_t)(dwt->memH + (win_h_x1 - win_h_x0) * VERT_PASS_WIDTH) - |
1586 | 270 | (uint64_t)dwt->allocatedMem < |
1587 | 270 | dwt->lenBytes_); |
1588 | 1.21k | for(i = 0; i < win_h_x1 - win_h_x0; i++) |
1589 | 947 | { |
1590 | 4.73k | for(uint32_t off = 0; off < VERT_PASS_WIDTH; off++) |
1591 | 3.78k | S_off(buf, i, off) += (DD_off_(buf, i, off) + DD_sgnd_off_(buf, i - 1, off)) >> 1; |
1592 | 947 | } |
1593 | 270 | } |
1594 | 276 | } |
1595 | 30.2k | } |
1596 | | |
1597 | | private: |
1598 | | void adjust_bounds(dwt_data<T>* dwt, [[maybe_unused]] int64_t sn_full, |
1599 | | [[maybe_unused]] int64_t dn_full, int64_t* sn, int64_t* dn) |
1600 | 114k | { |
1601 | 114k | if((uint64_t)dwt->memH < (uint64_t)dwt->memL && *sn == *dn) |
1602 | 0 | { |
1603 | 0 | assert(dn_full == sn_full - 1); |
1604 | 0 | (*dn)--; |
1605 | 0 | } |
1606 | 114k | if((uint64_t)dwt->memL < (uint64_t)dwt->memH && *sn == *dn) |
1607 | 0 | { |
1608 | 0 | assert(sn_full == dn_full - 1); |
1609 | 0 | (*sn)--; |
1610 | 0 | } |
1611 | 114k | } |
1612 | | #ifdef GRK_DEBUG_SPARSE |
1613 | | inline T get_S(T* const buf, int64_t i) |
1614 | | { |
1615 | | auto ret = buf[(i) << 1]; |
1616 | | assert(abs(ret) < 0xFFFFFFF); |
1617 | | return ret; |
1618 | | } |
1619 | | inline T get_D(T* const buf, int64_t i) |
1620 | | { |
1621 | | auto ret = buf[(1 + ((i) << 1))]; |
1622 | | assert(abs(ret) < 0xFFFFFFF); |
1623 | | return ret; |
1624 | | } |
1625 | | inline T get_S_off(T* const buf, int64_t i, int64_t off) |
1626 | | { |
1627 | | auto ret = buf[(i) * 2 * VERT_PASS_WIDTH + off]; |
1628 | | assert(abs(ret) < 0xFFFFFFF); |
1629 | | return ret; |
1630 | | } |
1631 | | inline T get_D_off(T* const buf, int64_t i, int64_t off) |
1632 | | { |
1633 | | auto ret = buf[(1 + (i) * 2) * VERT_PASS_WIDTH + off]; |
1634 | | assert(abs(ret) < 0xFFFFFFF); |
1635 | | return ret; |
1636 | | } |
1637 | | #endif |
1638 | | }; |
1639 | | template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH> |
1640 | | class Partial97 : public PartialInterleaver<T, FILTER_WIDTH, VERT_PASS_WIDTH> |
1641 | | { |
1642 | | public: |
1643 | | void decompress_h(dwt_data<T>* dwt) |
1644 | 436k | { |
1645 | 436k | WaveletReverse::decompress_step_97(dwt); |
1646 | 436k | } |
1647 | | void decompress_v(dwt_data<T>* dwt) |
1648 | 449k | { |
1649 | 449k | WaveletReverse::decompress_step_97(dwt); |
1650 | 449k | } |
1651 | | }; |
1652 | | // Notes: |
1653 | | // 1. line buffer 0 offset == dwt->win_l.x0 |
1654 | | // 2. dwt->memL and dwt->memH are only set for partial decode |
1655 | | Params97 WaveletReverse::makeParams97(dwt_data<vec4f>* dwt, bool isBandL, bool step1) |
1656 | 4.38M | { |
1657 | 4.38M | Params97 rc; |
1658 | | // band_0 specifies absolute start of line buffer |
1659 | 4.38M | int64_t band_0 = isBandL ? dwt->win_l.x0 : dwt->win_h.x0; |
1660 | 4.38M | int64_t band_1 = isBandL ? dwt->win_l.x1 : dwt->win_h.x1; |
1661 | 4.38M | auto memPartial = isBandL ? dwt->memL : dwt->memH; |
1662 | 4.38M | int64_t parityOffset = isBandL ? dwt->parity : !dwt->parity; |
1663 | 4.38M | int64_t lenMax = isBandL |
1664 | 4.38M | ? (std::min<int64_t>)(dwt->sn_full, (int64_t)dwt->dn_full - parityOffset) |
1665 | 4.38M | : (std::min<int64_t>)(dwt->dn_full, (int64_t)dwt->sn_full - parityOffset); |
1666 | 4.38M | if(lenMax < 0) |
1667 | 4.49k | lenMax = 0; |
1668 | 4.38M | assert(lenMax >= band_0); |
1669 | 4.38M | lenMax -= band_0; |
1670 | 18.4E | rc.data = memPartial ? memPartial : dwt->mem; |
1671 | | |
1672 | 4.38M | assert(!memPartial || (dwt->win_l.x1 <= dwt->sn_full && dwt->win_h.x1 <= dwt->dn_full)); |
1673 | 4.38M | assert(band_1 >= band_0); |
1674 | | |
1675 | 4.38M | rc.data += parityOffset + band_0 - dwt->win_l.x0; |
1676 | 4.38M | rc.len = (uint32_t)(band_1 - band_0); |
1677 | 4.38M | if(!step1) |
1678 | 3.00M | { |
1679 | 3.00M | rc.data += 1; |
1680 | 3.00M | rc.dataPrev = parityOffset ? rc.data - 2 : rc.data; |
1681 | 3.00M | rc.lenMax = (uint32_t)lenMax; |
1682 | 3.00M | } |
1683 | 4.38M | if(memPartial) |
1684 | 4.43M | { |
1685 | 4.43M | assert((uint64_t)rc.data >= (uint64_t)dwt->allocatedMem); |
1686 | 4.43M | assert((uint64_t)rc.data <= (uint64_t)dwt->allocatedMem + dwt->lenBytes_); |
1687 | 4.43M | } |
1688 | | |
1689 | 4.38M | return rc; |
1690 | 4.38M | }; |
1691 | | |
1692 | | template<uint32_t FILTER_WIDTH> |
1693 | | struct PartialBandInfo |
1694 | | { |
1695 | | // 1. set up windows for horizontal and vertical passes |
1696 | | grk_rect32 bandWindowREL_[BAND_NUM_ORIENTATIONS]; |
1697 | | // two windows formed by horizontal pass and used as input for vertical pass |
1698 | | grk_rect32 splitWindowREL_[SPLIT_NUM_ORIENTATIONS]; |
1699 | | grk_rect32 resWindowREL_; |
1700 | | |
1701 | | bool alloc(ISparseCanvas* sa, uint8_t resno, Resolution* fullRes, |
1702 | | TileComponentWindow<int32_t>* tileWindow) |
1703 | 40.4k | { |
1704 | 40.4k | bandWindowREL_[BAND_ORIENT_LL] = |
1705 | 40.4k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL); |
1706 | 40.4k | bandWindowREL_[BAND_ORIENT_HL] = |
1707 | 40.4k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL); |
1708 | 40.4k | bandWindowREL_[BAND_ORIENT_LH] = |
1709 | 40.4k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH); |
1710 | 40.4k | bandWindowREL_[BAND_ORIENT_HH] = |
1711 | 40.4k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH); |
1712 | | |
1713 | | // band windows in band coordinates - needed to pre-allocate sparse blocks |
1714 | 40.4k | grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS]; |
1715 | | |
1716 | 40.4k | tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL]; |
1717 | 40.4k | tileBandWindowREL[BAND_ORIENT_HL] = |
1718 | 40.4k | bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0); |
1719 | 40.4k | tileBandWindowREL[BAND_ORIENT_LH] = |
1720 | 40.4k | bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height()); |
1721 | 40.4k | tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan( |
1722 | 40.4k | fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height()); |
1723 | | // 2. pre-allocate sparse blocks |
1724 | 201k | for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i) |
1725 | 160k | { |
1726 | 160k | auto temp = tileBandWindowREL[i]; |
1727 | 160k | if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()), |
1728 | 160k | true)) |
1729 | 0 | return false; |
1730 | 160k | } |
1731 | 40.4k | resWindowREL_ = tileWindow->getResWindowBufferREL(resno); |
1732 | 40.4k | if(!sa->alloc(resWindowREL_, true)) |
1733 | 0 | return false; |
1734 | 40.4k | splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L); |
1735 | 40.4k | splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H); |
1736 | | |
1737 | 40.4k | auto fullResNext = fullRes + 1; |
1738 | 121k | for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k) |
1739 | 80.6k | { |
1740 | 80.6k | auto temp = splitWindowREL_[k]; |
1741 | 80.6k | if(!sa->alloc( |
1742 | 80.6k | temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()), |
1743 | 80.6k | true)) |
1744 | 0 | return false; |
1745 | 80.6k | } |
1746 | | |
1747 | 40.4k | return true; |
1748 | 40.4k | } grk::PartialBandInfo<1u>::alloc(grk::ISparseCanvas*, unsigned char, grk::Resolution*, grk::TileComponentWindow<int>*) Line | Count | Source | 1703 | 853 | { | 1704 | 853 | bandWindowREL_[BAND_ORIENT_LL] = | 1705 | 853 | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL); | 1706 | 853 | bandWindowREL_[BAND_ORIENT_HL] = | 1707 | 853 | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL); | 1708 | 853 | bandWindowREL_[BAND_ORIENT_LH] = | 1709 | 853 | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH); | 1710 | 853 | bandWindowREL_[BAND_ORIENT_HH] = | 1711 | 853 | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH); | 1712 | | | 1713 | | // band windows in band coordinates - needed to pre-allocate sparse blocks | 1714 | 853 | grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS]; | 1715 | | | 1716 | 853 | tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL]; | 1717 | 853 | tileBandWindowREL[BAND_ORIENT_HL] = | 1718 | 853 | bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0); | 1719 | 853 | tileBandWindowREL[BAND_ORIENT_LH] = | 1720 | 853 | bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height()); | 1721 | 853 | tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan( | 1722 | 853 | fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height()); | 1723 | | // 2. 
pre-allocate sparse blocks | 1724 | 4.26k | for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i) | 1725 | 3.41k | { | 1726 | 3.41k | auto temp = tileBandWindowREL[i]; | 1727 | 3.41k | if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()), | 1728 | 3.41k | true)) | 1729 | 0 | return false; | 1730 | 3.41k | } | 1731 | 853 | resWindowREL_ = tileWindow->getResWindowBufferREL(resno); | 1732 | 853 | if(!sa->alloc(resWindowREL_, true)) | 1733 | 0 | return false; | 1734 | 853 | splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L); | 1735 | 853 | splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H); | 1736 | | | 1737 | 853 | auto fullResNext = fullRes + 1; | 1738 | 2.55k | for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k) | 1739 | 1.70k | { | 1740 | 1.70k | auto temp = splitWindowREL_[k]; | 1741 | 1.70k | if(!sa->alloc( | 1742 | 1.70k | temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()), | 1743 | 1.70k | true)) | 1744 | 0 | return false; | 1745 | 1.70k | } | 1746 | | | 1747 | 853 | return true; | 1748 | 853 | } |
grk::PartialBandInfo<2u>::alloc(grk::ISparseCanvas*, unsigned char, grk::Resolution*, grk::TileComponentWindow<int>*) Line | Count | Source | 1703 | 39.5k | { | 1704 | 39.5k | bandWindowREL_[BAND_ORIENT_LL] = | 1705 | 39.5k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LL); | 1706 | 39.5k | bandWindowREL_[BAND_ORIENT_HL] = | 1707 | 39.5k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HL); | 1708 | 39.5k | bandWindowREL_[BAND_ORIENT_LH] = | 1709 | 39.5k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_LH); | 1710 | 39.5k | bandWindowREL_[BAND_ORIENT_HH] = | 1711 | 39.5k | tileWindow->getBandWindowBufferPaddedREL(resno, BAND_ORIENT_HH); | 1712 | | | 1713 | | // band windows in band coordinates - needed to pre-allocate sparse blocks | 1714 | 39.5k | grk_rect32 tileBandWindowREL[BAND_NUM_ORIENTATIONS]; | 1715 | | | 1716 | 39.5k | tileBandWindowREL[BAND_ORIENT_LL] = bandWindowREL_[BAND_ORIENT_LL]; | 1717 | 39.5k | tileBandWindowREL[BAND_ORIENT_HL] = | 1718 | 39.5k | bandWindowREL_[BAND_ORIENT_HL].pan(fullRes->tileBand[BAND_INDEX_LH].width(), 0); | 1719 | 39.5k | tileBandWindowREL[BAND_ORIENT_LH] = | 1720 | 39.5k | bandWindowREL_[BAND_ORIENT_LH].pan(0, fullRes->tileBand[BAND_INDEX_HL].height()); | 1721 | 39.5k | tileBandWindowREL[BAND_ORIENT_HH] = bandWindowREL_[BAND_ORIENT_HH].pan( | 1722 | 39.5k | fullRes->tileBand[BAND_INDEX_LH].width(), fullRes->tileBand[BAND_INDEX_HL].height()); | 1723 | | // 2. 
pre-allocate sparse blocks | 1724 | 197k | for(uint32_t i = 0; i < BAND_NUM_ORIENTATIONS; ++i) | 1725 | 157k | { | 1726 | 157k | auto temp = tileBandWindowREL[i]; | 1727 | 157k | if(!sa->alloc(temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullRes->width(), fullRes->height()), | 1728 | 157k | true)) | 1729 | 0 | return false; | 1730 | 157k | } | 1731 | 39.5k | resWindowREL_ = tileWindow->getResWindowBufferREL(resno); | 1732 | 39.5k | if(!sa->alloc(resWindowREL_, true)) | 1733 | 0 | return false; | 1734 | 39.5k | splitWindowREL_[SPLIT_L] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_L); | 1735 | 39.5k | splitWindowREL_[SPLIT_H] = tileWindow->getResWindowBufferSplitREL(resno, SPLIT_H); | 1736 | | | 1737 | 39.5k | auto fullResNext = fullRes + 1; | 1738 | 118k | for(uint32_t k = 0; k < SPLIT_NUM_ORIENTATIONS; ++k) | 1739 | 78.9k | { | 1740 | 78.9k | auto temp = splitWindowREL_[k]; | 1741 | 78.9k | if(!sa->alloc( | 1742 | 78.9k | temp.grow_IN_PLACE(2 * FILTER_WIDTH, fullResNext->width(), fullResNext->height()), | 1743 | 78.9k | true)) | 1744 | 0 | return false; | 1745 | 78.9k | } | 1746 | | | 1747 | 39.5k | return true; | 1748 | 39.5k | } |
|
1749 | | }; |
1750 | | |
1751 | | /** |
1752 | | * ************************************************************************************ |
1753 | | * |
1754 | | * 5/3 operates on elements of type int32_t while 9/7 operates on elements of type vec4f |
1755 | | * |
1756 | | * Horizontal pass |
1757 | | * |
1758 | | * Each thread processes a strip running the length of the window, of the following dimensions: |
1759 | | * |
1760 | | * 5/3 |
1761 | | * Height : 1 |
1762 | | * |
1763 | | * 9/7 |
1764 | | * Height : 4 |
1765 | | * |
1766 | | * Vertical pass |
1767 | | * |
1768 | | * 5/3 |
1769 | | * Width : 4 |
1770 | | * |
1771 | | * 9/7 |
1772 | | * Width : 1 |
1773 | | * |
1774 | | **************************************************************************** |
1775 | | * |
1776 | | * FILTER_WIDTH value matches the maximum left/right extension given in tables |
1777 | | * F.2 and F.3 of the standard |
1778 | | */ |
1779 | | template<typename T, uint32_t FILTER_WIDTH, uint32_t VERT_PASS_WIDTH, typename D> |
1780 | | |
1781 | | bool WaveletReverse::decompress_partial_tile(ISparseCanvas* sa, |
1782 | | std::vector<TaskInfo<T, dwt_data<T>>*>& tasks) |
1783 | 18.3k | { |
1784 | 18.3k | uint8_t numresolutions = tilec_->numresolutions; |
1785 | 18.3k | auto buf = tilec_->getWindow(); |
1786 | 18.3k | auto simpleBuf = buf->getResWindowBufferHighestSimple(); |
1787 | 18.3k | auto fullRes = tilec_->resolutions_; |
1788 | 18.3k | auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1; |
1789 | 18.3k | if(!fullResTopLevel->width() || !fullResTopLevel->height()) |
1790 | 6.93k | return true; |
1791 | | |
1792 | 11.3k | [[maybe_unused]] const uint16_t debug_compno = 0; |
1793 | 11.3k | const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t); |
1794 | 11.3k | const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) * |
1795 | 11.3k | sizeof(T) / sizeof(int32_t); |
1796 | | // reduce window |
1797 | 11.3k | auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_); |
1798 | 11.3k | assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow); |
1799 | | // shift to relative coordinates |
1800 | 11.3k | synthesisWindow = |
1801 | 11.3k | synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0); |
1802 | 11.3k | if(synthesisWindow.empty()) |
1803 | 33 | return true; |
1804 | 11.3k | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); |
1805 | 11.3k | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); |
1806 | | // imageComponentFlow == nullptr ==> no blocks were decompressed for this component |
1807 | 11.3k | if(!imageComponentFlow) |
1808 | 2.30k | return true; |
1809 | 9.04k | if(numres_ == 1U) |
1810 | 2.46k | { |
1811 | 2.46k | auto final_read = [sa, synthesisWindow, simpleBuf]() { |
1812 | | // final read into tile buffer |
1813 | 2.46k | bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); |
1814 | | |
1815 | 2.46k | return ret; |
1816 | 2.46k | }; grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#1}::operator()() const Line | Count | Source | 1811 | 2.28k | auto final_read = [sa, synthesisWindow, simpleBuf]() { | 1812 | | // final read into tile buffer | 1813 | 2.28k | bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1814 | | | 1815 | 2.28k | return ret; | 1816 | 2.28k | }; |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#1}::operator()() const Line | Count | Source | 1811 | 180 | auto final_read = [sa, synthesisWindow, simpleBuf]() { | 1812 | | // final read into tile buffer | 1813 | 180 | bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1814 | | | 1815 | 180 | return ret; | 1816 | 180 | }; |
|
1817 | 2.46k | if(numThreads > 1) |
1818 | 2.46k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#2}::operator()() const Line | Count | Source | 1818 | 2.28k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#2}::operator()() const Line | Count | Source | 1818 | 180 | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); |
|
1819 | 3 | else |
1820 | 3 | final_read(); |
1821 | | |
1822 | 2.46k | return true; |
1823 | 2.46k | } |
1824 | 6.57k | auto final_read = [this, sa, synthesisWindow, simpleBuf]() { |
1825 | | // final read into tile buffer |
1826 | 6.56k | bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); |
1827 | | |
1828 | 6.56k | return ret; |
1829 | 6.56k | }; grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#3}::operator()() const Line | Count | Source | 1824 | 271 | auto final_read = [this, sa, synthesisWindow, simpleBuf]() { | 1825 | | // final read into tile buffer | 1826 | 271 | bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1827 | | | 1828 | 271 | return ret; | 1829 | 271 | }; |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#3}::operator()() const Line | Count | Source | 1824 | 6.29k | auto final_read = [this, sa, synthesisWindow, simpleBuf]() { | 1825 | | // final read into tile buffer | 1826 | 6.29k | bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1827 | | | 1828 | 6.29k | return ret; | 1829 | 6.29k | }; |
|
1830 | | // pre-allocate all blocks |
1831 | 6.57k | std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo; |
1832 | 47.0k | for(uint8_t resno = 1; resno < numres_; resno++) |
1833 | 40.4k | { |
1834 | 40.4k | PartialBandInfo<FILTER_WIDTH> bandInfo; |
1835 | 40.4k | if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf)) |
1836 | 0 | return false; |
1837 | 40.4k | resBandInfo.push_back(bandInfo); |
1838 | 40.4k | } |
1839 | 6.57k | D decompressor; |
1840 | 47.0k | for(uint8_t resno = 1; resno < numres_; resno++) |
1841 | 40.4k | { |
1842 | 40.4k | dwt_data<T> horiz; |
1843 | 40.4k | dwt_data<T> vert; |
1844 | 40.4k | horiz.sn_full = fullRes->width(); |
1845 | 40.4k | vert.sn_full = fullRes->height(); |
1846 | 40.4k | fullRes++; |
1847 | 40.4k | horiz.dn_full = fullRes->width() - horiz.sn_full; |
1848 | 40.4k | horiz.parity = fullRes->x0 & 1; |
1849 | 40.4k | vert.dn_full = fullRes->height() - vert.sn_full; |
1850 | 40.4k | vert.parity = fullRes->y0 & 1; |
1851 | 40.4k | PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1]; |
1852 | | |
1853 | 457k | auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { |
1854 | 948k | for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_; |
1855 | 491k | yPos += HORIZ_PASS_HEIGHT) |
1856 | 513k | { |
1857 | 513k | auto height = |
1858 | 513k | std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos); |
1859 | 513k | taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity; |
1860 | 513k | taskInfo->data.memH = |
1861 | 513k | taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) + |
1862 | 513k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0); |
1863 | 513k | if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height)) |
1864 | 19.7k | { |
1865 | 19.7k | return false; |
1866 | 19.7k | } |
1867 | 493k | taskInfo->data.memL = taskInfo->data.mem; |
1868 | 493k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - |
1869 | 493k | (int64_t)taskInfo->data.win_l.x0); |
1870 | 493k | decompressor.decompress_h(&taskInfo->data); |
1871 | 493k | if(!sa->write(resno, |
1872 | 493k | grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1, |
1873 | 493k | yPos + height), |
1874 | 493k | (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 - |
1875 | 493k | 2 * (int64_t)taskInfo->data.win_l.x0), |
1876 | 493k | HORIZ_PASS_HEIGHT, 1)) |
1877 | 2.46k | { |
1878 | 2.46k | return false; |
1879 | 2.46k | } |
1880 | 493k | } |
1881 | | |
1882 | 435k | return true; |
1883 | 457k | }; grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda(grk::TaskInfo<int, grk::dwt_data<int> >*)#1}::operator()(grk::TaskInfo<int, grk::dwt_data<int> >*) const Line | Count | Source | 1853 | 37.1k | auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1854 | 121k | for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_; | 1855 | 84.3k | yPos += HORIZ_PASS_HEIGHT) | 1856 | 84.4k | { | 1857 | 84.4k | auto height = | 1858 | 84.4k | std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos); | 1859 | 84.4k | taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity; | 1860 | 84.4k | taskInfo->data.memH = | 1861 | 84.4k | taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) + | 1862 | 84.4k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0); | 1863 | 84.4k | if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height)) | 1864 | 0 | { | 1865 | 0 | return false; | 1866 | 0 | } | 1867 | 84.4k | taskInfo->data.memL = taskInfo->data.mem; | 1868 | 84.4k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1869 | 84.4k | (int64_t)taskInfo->data.win_l.x0); | 1870 | 84.4k | decompressor.decompress_h(&taskInfo->data); | 1871 | 84.4k | if(!sa->write(resno, | 1872 | 84.4k | grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1, | 1873 | 84.4k | yPos + height), | 1874 | 84.4k | (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 - | 1875 | 84.4k | 2 * (int64_t)taskInfo->data.win_l.x0), | 1876 | 84.4k | HORIZ_PASS_HEIGHT, 1)) | 1877 | 54 | { | 1878 | 54 | return false; | 1879 | 54 | } | 1880 | 84.4k | } | 1881 | | | 1882 | 37.1k | return true; | 1883 | 37.1k | }; |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*)#1}::operator()(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*) const Line | Count | Source | 1853 | 420k | auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1854 | 826k | for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_; | 1855 | 420k | yPos += HORIZ_PASS_HEIGHT) | 1856 | 428k | { | 1857 | 428k | auto height = | 1858 | 428k | std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos); | 1859 | 428k | taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity; | 1860 | 428k | taskInfo->data.memH = | 1861 | 428k | taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) + | 1862 | 428k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0); | 1863 | 428k | if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height)) | 1864 | 19.7k | { | 1865 | 19.7k | return false; | 1866 | 19.7k | } | 1867 | 409k | taskInfo->data.memL = taskInfo->data.mem; | 1868 | 409k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1869 | 409k | (int64_t)taskInfo->data.win_l.x0); | 1870 | 409k | decompressor.decompress_h(&taskInfo->data); | 1871 | 409k | if(!sa->write(resno, | 1872 | 409k | grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1, | 1873 | 409k | yPos + height), | 1874 | 409k | (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 - | 1875 | 409k | 2 * (int64_t)taskInfo->data.win_l.x0), | 1876 | 409k | HORIZ_PASS_HEIGHT, 1)) | 1877 | 2.41k | { | 1878 | 2.41k | return 
false; | 1879 | 2.41k | } | 1880 | 409k | } | 1881 | | | 1882 | 398k | return true; | 1883 | 420k | }; |
|
1884 | 357k | auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { |
1885 | 816k | for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_; |
1886 | 459k | xPos += VERT_PASS_WIDTH) |
1887 | 487k | { |
1888 | 487k | auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos)); |
1889 | 487k | taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH; |
1890 | 487k | taskInfo->data.memH = |
1891 | 487k | taskInfo->data.mem + |
1892 | 487k | ((!taskInfo->data.parity) + |
1893 | 487k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) * |
1894 | 487k | VERT_PASS_WIDTH; |
1895 | 487k | if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width)) |
1896 | 28.8k | { |
1897 | 28.8k | return false; |
1898 | 28.8k | } |
1899 | 459k | taskInfo->data.memL = taskInfo->data.mem; |
1900 | 459k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - |
1901 | 459k | (int64_t)taskInfo->data.win_l.x0) * |
1902 | 459k | VERT_PASS_WIDTH; |
1903 | 459k | decompressor.decompress_v(&taskInfo->data); |
1904 | | // write to buffer for final res |
1905 | 459k | if(!sa->write(resno, |
1906 | 459k | grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width, |
1907 | 459k | bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() + |
1908 | 459k | taskInfo->data.win_h.length()), |
1909 | 459k | (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 - |
1910 | 459k | 2 * (int64_t)taskInfo->data.win_l.x0) * |
1911 | 459k | VERT_PASS_WIDTH), |
1912 | 459k | 1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t)))) |
1913 | 0 | { |
1914 | 0 | Logger::logger_.error("Sparse array write failure"); |
1915 | 0 | return false; |
1916 | 0 | } |
1917 | 459k | } |
1918 | | |
1919 | 329k | return true; |
1920 | 357k | }; grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda(grk::TaskInfo<int, grk::dwt_data<int> >*)#2}::operator()(grk::TaskInfo<int, grk::dwt_data<int> >*) const Line | Count | Source | 1884 | 20.3k | auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1885 | 50.3k | for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_; | 1886 | 30.0k | xPos += VERT_PASS_WIDTH) | 1887 | 30.0k | { | 1888 | 30.0k | auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos)); | 1889 | 30.0k | taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH; | 1890 | 30.0k | taskInfo->data.memH = | 1891 | 30.0k | taskInfo->data.mem + | 1892 | 30.0k | ((!taskInfo->data.parity) + | 1893 | 30.0k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) * | 1894 | 30.0k | VERT_PASS_WIDTH; | 1895 | 30.0k | if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width)) | 1896 | 3 | { | 1897 | 3 | return false; | 1898 | 3 | } | 1899 | 30.0k | taskInfo->data.memL = taskInfo->data.mem; | 1900 | 30.0k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1901 | 30.0k | (int64_t)taskInfo->data.win_l.x0) * | 1902 | 30.0k | VERT_PASS_WIDTH; | 1903 | 30.0k | decompressor.decompress_v(&taskInfo->data); | 1904 | | // write to buffer for final res | 1905 | 30.0k | if(!sa->write(resno, | 1906 | 30.0k | grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width, | 1907 | 30.0k | bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() + | 1908 | 30.0k | taskInfo->data.win_h.length()), | 1909 | 30.0k | (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 - | 1910 | 30.0k | 2 * (int64_t)taskInfo->data.win_l.x0) * | 1911 | 30.0k | VERT_PASS_WIDTH), | 
1912 | 30.0k | 1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t)))) | 1913 | 0 | { | 1914 | 0 | Logger::logger_.error("Sparse array write failure"); | 1915 | 0 | return false; | 1916 | 0 | } | 1917 | 30.0k | } | 1918 | | | 1919 | 20.3k | return true; | 1920 | 20.3k | }; |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*)#2}::operator()(grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*) const Line | Count | Source | 1884 | 337k | auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1885 | 766k | for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_; | 1886 | 429k | xPos += VERT_PASS_WIDTH) | 1887 | 457k | { | 1888 | 457k | auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos)); | 1889 | 457k | taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH; | 1890 | 457k | taskInfo->data.memH = | 1891 | 457k | taskInfo->data.mem + | 1892 | 457k | ((!taskInfo->data.parity) + | 1893 | 457k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) * | 1894 | 457k | VERT_PASS_WIDTH; | 1895 | 457k | if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width)) | 1896 | 28.8k | { | 1897 | 28.8k | return false; | 1898 | 28.8k | } | 1899 | 429k | taskInfo->data.memL = taskInfo->data.mem; | 1900 | 429k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1901 | 429k | (int64_t)taskInfo->data.win_l.x0) * | 1902 | 429k | VERT_PASS_WIDTH; | 1903 | 429k | decompressor.decompress_v(&taskInfo->data); | 1904 | | // write to buffer for final res | 1905 | 429k | if(!sa->write(resno, | 1906 | 429k | grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width, | 1907 | 429k | bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() + | 1908 | 429k | taskInfo->data.win_h.length()), | 1909 | 429k | 
(int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 - | 1910 | 429k | 2 * (int64_t)taskInfo->data.win_l.x0) * | 1911 | 429k | VERT_PASS_WIDTH), | 1912 | 429k | 1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t)))) | 1913 | 0 | { | 1914 | 0 | Logger::logger_.error("Sparse array write failure"); | 1915 | 0 | return false; | 1916 | 0 | } | 1917 | 429k | } | 1918 | | | 1919 | 308k | return true; | 1920 | 337k | }; |
|
1921 | | |
1922 | | // 3. calculate synthesis |
1923 | 40.4k | horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX(); |
1924 | 40.4k | horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX(); |
1925 | 40.4k | horiz.resno = resno; |
1926 | 40.4k | size_t dataLength = |
1927 | 40.4k | (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT; |
1928 | 40.4k | auto resFlow = imageComponentFlow->getResFlow(resno - 1); |
1929 | 120k | for(uint32_t k = 0; k < 2 && dataLength; ++k) |
1930 | 80.4k | { |
1931 | 80.4k | uint32_t numTasks = numThreads; |
1932 | 80.4k | uint32_t num_rows = bandInfo.splitWindowREL_[k].height(); |
1933 | 80.4k | if(num_rows < numTasks) |
1934 | 77.4k | numTasks = num_rows; |
1935 | 80.4k | uint32_t incrPerJob = numTasks ? (num_rows / numTasks) : 0; |
1936 | 80.4k | if(numThreads == 1) |
1937 | 0 | numTasks = 1; |
1938 | 80.4k | if(incrPerJob == 0) |
1939 | 13.0k | continue; |
1940 | 547k | for(uint32_t j = 0; j < numTasks; ++j) |
1941 | 480k | { |
1942 | 480k | uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob; |
1943 | 480k | uint32_t indexMax = j < (numTasks - 1U) |
1944 | 480k | ? bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob |
1945 | 480k | : bandInfo.splitWindowREL_[k].y1; |
1946 | 480k | if(indexMin == indexMax) |
1947 | 0 | continue; |
1948 | 480k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax); |
1949 | 480k | if(!taskInfo->data.alloc(dataLength, pad)) |
1950 | 0 | { |
1951 | 0 | delete taskInfo; |
1952 | 0 | return false; |
1953 | 0 | } |
1954 | 480k | tasks.push_back(taskInfo); |
1955 | 480k | if(numThreads > 1) |
1956 | 492k | resFlow->waveletHoriz_->nextTask().work( |
1957 | 492k | [taskInfo, executor_h] { executor_h(taskInfo); }); grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#4}::operator()() const Line | Count | Source | 1957 | 37.1k | [taskInfo, executor_h] { executor_h(taskInfo); }); |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#4}::operator()() const Line | Count | Source | 1957 | 421k | [taskInfo, executor_h] { executor_h(taskInfo); }); |
|
1958 | 18.4E | else |
1959 | 18.4E | executor_h(taskInfo); |
1960 | 480k | } |
1961 | 67.4k | } |
1962 | 40.4k | dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH * |
1963 | 40.4k | sizeof(T) / sizeof(int32_t); |
1964 | 40.4k | vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY(); |
1965 | 40.4k | vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY(); |
1966 | 40.4k | vert.resno = resno; |
1967 | 40.4k | uint32_t numTasks = numThreads; |
1968 | 40.4k | uint32_t numColumns = bandInfo.resWindowREL_.width(); |
1969 | 40.4k | if(numColumns < numTasks) |
1970 | 34.4k | numTasks = numColumns; |
1971 | 40.4k | uint32_t incrPerJob = numTasks ? (numColumns / numTasks) : 0; |
1972 | 40.4k | if(numThreads == 1) |
1973 | 0 | numTasks = 1; |
1974 | 412k | for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j) |
1975 | 372k | { |
1976 | 372k | uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob; |
1977 | 372k | uint32_t indexMax = j < (numTasks - 1U) ? bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob |
1978 | 372k | : bandInfo.resWindowREL_.x1; |
1979 | 372k | if(indexMin == indexMax) |
1980 | 0 | continue; |
1981 | 372k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax); |
1982 | 372k | if(!taskInfo->data.alloc(dataLength, pad)) |
1983 | 0 | { |
1984 | 0 | delete taskInfo; |
1985 | 0 | return false; |
1986 | 0 | } |
1987 | 372k | tasks.push_back(taskInfo); |
1988 | 372k | if(numThreads > 1) |
1989 | 381k | resFlow->waveletVert_->nextTask().work( |
1990 | 381k | [taskInfo, executor_v] { executor_v(taskInfo); }); grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#5}::operator()() const Line | Count | Source | 1990 | 20.3k | [taskInfo, executor_v] { executor_v(taskInfo); }); |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#5}::operator()() const Line | Count | Source | 1990 | 338k | [taskInfo, executor_v] { executor_v(taskInfo); }); |
|
1991 | 18.4E | else |
1992 | 18.4E | executor_v(taskInfo); |
1993 | 372k | } |
1994 | 40.4k | } |
1995 | | |
1996 | 6.57k | if(numThreads > 1) |
1997 | 6.58k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&)::{lambda()#6}::operator()() const Line | Count | Source | 1997 | 271 | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); |
grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&)::{lambda()#6}::operator()() const Line | Count | Source | 1997 | 6.29k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); |
|
1998 | 18.4E | else |
1999 | 18.4E | final_read(); |
2000 | | |
2001 | 6.57k | return true; |
2002 | 6.57k | } bool grk::WaveletReverse::decompress_partial_tile<int, 1u, 4u, grk::Partial53<int, 2u, 4u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<int, grk::dwt_data<int> >*, std::__1::allocator<grk::TaskInfo<int, grk::dwt_data<int> >*> >&) Line | Count | Source | 1783 | 6.85k | { | 1784 | 6.85k | uint8_t numresolutions = tilec_->numresolutions; | 1785 | 6.85k | auto buf = tilec_->getWindow(); | 1786 | 6.85k | auto simpleBuf = buf->getResWindowBufferHighestSimple(); | 1787 | 6.85k | auto fullRes = tilec_->resolutions_; | 1788 | 6.85k | auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1; | 1789 | 6.85k | if(!fullResTopLevel->width() || !fullResTopLevel->height()) | 1790 | 2.20k | return true; | 1791 | | | 1792 | 4.64k | [[maybe_unused]] const uint16_t debug_compno = 0; | 1793 | 4.64k | const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t); | 1794 | 4.64k | const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) * | 1795 | 4.64k | sizeof(T) / sizeof(int32_t); | 1796 | | // reduce window | 1797 | 4.64k | auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_); | 1798 | 4.64k | assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow); | 1799 | | // shift to relative coordinates | 1800 | 4.64k | synthesisWindow = | 1801 | 4.64k | synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0); | 1802 | 4.64k | if(synthesisWindow.empty()) | 1803 | 30 | return true; | 1804 | 4.61k | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); | 1805 | 4.61k | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); | 1806 | | // imageComponentFlow == nullptr ==> no blocks were decompressed for this component | 1807 | 4.61k | if(!imageComponentFlow) | 1808 | 2.07k | return true; | 1809 | 2.54k | if(numres_ == 1U) | 1810 | 2.28k | { | 1811 | 2.28k | auto final_read = [sa, synthesisWindow, simpleBuf]() { | 1812 | | // final read 
into tile buffer | 1813 | 2.28k | bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1814 | | | 1815 | 2.28k | return ret; | 1816 | 2.28k | }; | 1817 | 2.28k | if(numThreads > 1) | 1818 | 2.28k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); | 1819 | 3 | else | 1820 | 3 | final_read(); | 1821 | | | 1822 | 2.28k | return true; | 1823 | 2.28k | } | 1824 | 261 | auto final_read = [this, sa, synthesisWindow, simpleBuf]() { | 1825 | | // final read into tile buffer | 1826 | 261 | bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1827 | | | 1828 | 261 | return ret; | 1829 | 261 | }; | 1830 | | // pre-allocate all blocks | 1831 | 261 | std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo; | 1832 | 1.11k | for(uint8_t resno = 1; resno < numres_; resno++) | 1833 | 853 | { | 1834 | 853 | PartialBandInfo<FILTER_WIDTH> bandInfo; | 1835 | 853 | if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf)) | 1836 | 0 | return false; | 1837 | 853 | resBandInfo.push_back(bandInfo); | 1838 | 853 | } | 1839 | 261 | D decompressor; | 1840 | 1.11k | for(uint8_t resno = 1; resno < numres_; resno++) | 1841 | 853 | { | 1842 | 853 | dwt_data<T> horiz; | 1843 | 853 | dwt_data<T> vert; | 1844 | 853 | horiz.sn_full = fullRes->width(); | 1845 | 853 | vert.sn_full = fullRes->height(); | 1846 | 853 | fullRes++; | 1847 | 853 | horiz.dn_full = fullRes->width() - horiz.sn_full; | 1848 | 853 | horiz.parity = fullRes->x0 & 1; | 1849 | 853 | vert.dn_full = fullRes->height() - vert.sn_full; | 1850 | 853 | vert.parity = fullRes->y0 & 1; | 1851 | 853 | PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1]; | 1852 | | | 1853 | 853 | auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1854 | 853 | for(uint32_t yPos = taskInfo->indexMin_; yPos < taskInfo->indexMax_; | 1855 | 853 | yPos += HORIZ_PASS_HEIGHT) | 1856 | 853 | { | 1857 | 853 | auto 
height = | 1858 | 853 | std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos); | 1859 | 853 | taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity; | 1860 | 853 | taskInfo->data.memH = | 1861 | 853 | taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) + | 1862 | 853 | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0); | 1863 | 853 | if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height)) | 1864 | 853 | { | 1865 | 853 | return false; | 1866 | 853 | } | 1867 | 853 | taskInfo->data.memL = taskInfo->data.mem; | 1868 | 853 | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1869 | 853 | (int64_t)taskInfo->data.win_l.x0); | 1870 | 853 | decompressor.decompress_h(&taskInfo->data); | 1871 | 853 | if(!sa->write(resno, | 1872 | 853 | grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1, | 1873 | 853 | yPos + height), | 1874 | 853 | (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 - | 1875 | 853 | 2 * (int64_t)taskInfo->data.win_l.x0), | 1876 | 853 | HORIZ_PASS_HEIGHT, 1)) | 1877 | 853 | { | 1878 | 853 | return false; | 1879 | 853 | } | 1880 | 853 | } | 1881 | | | 1882 | 853 | return true; | 1883 | 853 | }; | 1884 | 853 | auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1885 | 853 | for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_; | 1886 | 853 | xPos += VERT_PASS_WIDTH) | 1887 | 853 | { | 1888 | 853 | auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos)); | 1889 | 853 | taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH; | 1890 | 853 | taskInfo->data.memH = | 1891 | 853 | taskInfo->data.mem + | 1892 | 853 | ((!taskInfo->data.parity) + | 1893 | 853 | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) * | 1894 | 853 | VERT_PASS_WIDTH; | 1895 | 853 | if(!decompressor.interleave_v(&taskInfo->data, 
sa, xPos, width)) | 1896 | 853 | { | 1897 | 853 | return false; | 1898 | 853 | } | 1899 | 853 | taskInfo->data.memL = taskInfo->data.mem; | 1900 | 853 | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1901 | 853 | (int64_t)taskInfo->data.win_l.x0) * | 1902 | 853 | VERT_PASS_WIDTH; | 1903 | 853 | decompressor.decompress_v(&taskInfo->data); | 1904 | | // write to buffer for final res | 1905 | 853 | if(!sa->write(resno, | 1906 | 853 | grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width, | 1907 | 853 | bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() + | 1908 | 853 | taskInfo->data.win_h.length()), | 1909 | 853 | (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 - | 1910 | 853 | 2 * (int64_t)taskInfo->data.win_l.x0) * | 1911 | 853 | VERT_PASS_WIDTH), | 1912 | 853 | 1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t)))) | 1913 | 853 | { | 1914 | 853 | Logger::logger_.error("Sparse array write failure"); | 1915 | 853 | return false; | 1916 | 853 | } | 1917 | 853 | } | 1918 | | | 1919 | 853 | return true; | 1920 | 853 | }; | 1921 | | | 1922 | | // 3. calculate synthesis | 1923 | 853 | horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX(); | 1924 | 853 | horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX(); | 1925 | 853 | horiz.resno = resno; | 1926 | 853 | size_t dataLength = | 1927 | 853 | (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT; | 1928 | 853 | auto resFlow = imageComponentFlow->getResFlow(resno - 1); | 1929 | 2.55k | for(uint32_t k = 0; k < 2 && dataLength; ++k) | 1930 | 1.70k | { | 1931 | 1.70k | uint32_t numTasks = numThreads; | 1932 | 1.70k | uint32_t num_rows = bandInfo.splitWindowREL_[k].height(); | 1933 | 1.70k | if(num_rows < numTasks) | 1934 | 768 | numTasks = num_rows; | 1935 | 1.70k | uint32_t incrPerJob = numTasks ? 
(num_rows / numTasks) : 0; | 1936 | 1.70k | if(numThreads == 1) | 1937 | 0 | numTasks = 1; | 1938 | 1.70k | if(incrPerJob == 0) | 1939 | 66 | continue; | 1940 | 39.0k | for(uint32_t j = 0; j < numTasks; ++j) | 1941 | 37.3k | { | 1942 | 37.3k | uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob; | 1943 | 37.3k | uint32_t indexMax = j < (numTasks - 1U) | 1944 | 37.3k | ? bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob | 1945 | 37.3k | : bandInfo.splitWindowREL_[k].y1; | 1946 | 37.3k | if(indexMin == indexMax) | 1947 | 0 | continue; | 1948 | 37.3k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax); | 1949 | 37.3k | if(!taskInfo->data.alloc(dataLength, pad)) | 1950 | 0 | { | 1951 | 0 | delete taskInfo; | 1952 | 0 | return false; | 1953 | 0 | } | 1954 | 37.3k | tasks.push_back(taskInfo); | 1955 | 37.3k | if(numThreads > 1) | 1956 | 37.8k | resFlow->waveletHoriz_->nextTask().work( | 1957 | 37.8k | [taskInfo, executor_h] { executor_h(taskInfo); }); | 1958 | 18.4E | else | 1959 | 18.4E | executor_h(taskInfo); | 1960 | 37.3k | } | 1961 | 1.64k | } | 1962 | 853 | dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH * | 1963 | 853 | sizeof(T) / sizeof(int32_t); | 1964 | 853 | vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY(); | 1965 | 853 | vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY(); | 1966 | 853 | vert.resno = resno; | 1967 | 853 | uint32_t numTasks = numThreads; | 1968 | 853 | uint32_t numColumns = bandInfo.resWindowREL_.width(); | 1969 | 853 | if(numColumns < numTasks) | 1970 | 237 | numTasks = numColumns; | 1971 | 853 | uint32_t incrPerJob = numTasks ? 
(numColumns / numTasks) : 0; | 1972 | 853 | if(numThreads == 1) | 1973 | 0 | numTasks = 1; | 1974 | 21.3k | for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j) | 1975 | 20.4k | { | 1976 | 20.4k | uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob; | 1977 | 20.4k | uint32_t indexMax = j < (numTasks - 1U) ? bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob | 1978 | 20.4k | : bandInfo.resWindowREL_.x1; | 1979 | 20.4k | if(indexMin == indexMax) | 1980 | 0 | continue; | 1981 | 20.4k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax); | 1982 | 20.4k | if(!taskInfo->data.alloc(dataLength, pad)) | 1983 | 0 | { | 1984 | 0 | delete taskInfo; | 1985 | 0 | return false; | 1986 | 0 | } | 1987 | 20.4k | tasks.push_back(taskInfo); | 1988 | 20.4k | if(numThreads > 1) | 1989 | 20.6k | resFlow->waveletVert_->nextTask().work( | 1990 | 20.6k | [taskInfo, executor_v] { executor_v(taskInfo); }); | 1991 | 18.4E | else | 1992 | 18.4E | executor_v(taskInfo); | 1993 | 20.4k | } | 1994 | 853 | } | 1995 | | | 1996 | 261 | if(numThreads > 1) | 1997 | 271 | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); | 1998 | 18.4E | else | 1999 | 18.4E | final_read(); | 2000 | | | 2001 | 261 | return true; | 2002 | 261 | } |
bool grk::WaveletReverse::decompress_partial_tile<grk::vec<float, 4ul>, 2u, 1u, grk::Partial97<grk::vec<float, 4ul>, 2u, 1u> >(grk::ISparseCanvas*, std::__1::vector<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*, std::__1::allocator<grk::TaskInfo<grk::vec<float, 4ul>, grk::dwt_data<grk::vec<float, 4ul> > >*> >&) Line | Count | Source | 1783 | 11.4k | { | 1784 | 11.4k | uint8_t numresolutions = tilec_->numresolutions; | 1785 | 11.4k | auto buf = tilec_->getWindow(); | 1786 | 11.4k | auto simpleBuf = buf->getResWindowBufferHighestSimple(); | 1787 | 11.4k | auto fullRes = tilec_->resolutions_; | 1788 | 11.4k | auto fullResTopLevel = tilec_->resolutions_ + numres_ - 1; | 1789 | 11.4k | if(!fullResTopLevel->width() || !fullResTopLevel->height()) | 1790 | 4.72k | return true; | 1791 | | | 1792 | 6.73k | [[maybe_unused]] const uint16_t debug_compno = 0; | 1793 | 6.73k | const uint32_t HORIZ_PASS_HEIGHT = sizeof(T) / sizeof(int32_t); | 1794 | 6.73k | const uint32_t pad = FILTER_WIDTH * std::max<uint32_t>(HORIZ_PASS_HEIGHT, VERT_PASS_WIDTH) * | 1795 | 6.73k | sizeof(T) / sizeof(int32_t); | 1796 | | // reduce window | 1797 | 6.73k | auto synthesisWindow = unreducedWindow_.scaleDownCeilPow2(numresolutions - numres_); | 1798 | 6.73k | assert(fullResTopLevel->intersection(synthesisWindow) == synthesisWindow); | 1799 | | // shift to relative coordinates | 1800 | 6.73k | synthesisWindow = | 1801 | 6.73k | synthesisWindow.pan(-(int64_t)fullResTopLevel->x0, -(int64_t)fullResTopLevel->y0); | 1802 | 6.73k | if(synthesisWindow.empty()) | 1803 | 3 | return true; | 1804 | 6.72k | uint32_t numThreads = (uint32_t)ExecSingleton::get().num_workers(); | 1805 | 6.72k | auto imageComponentFlow = scheduler_->getImageComponentFlow(compno_); | 1806 | | // imageComponentFlow == nullptr ==> no blocks were decompressed for this component | 1807 | 6.72k | if(!imageComponentFlow) | 1808 | 232 | return true; | 1809 | 6.49k | if(numres_ == 1U) | 1810 | 182 | { | 1811 | 182 | 
auto final_read = [sa, synthesisWindow, simpleBuf]() { | 1812 | | // final read into tile buffer | 1813 | 182 | bool ret = sa->read(0, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1814 | | | 1815 | 182 | return ret; | 1816 | 182 | }; | 1817 | 182 | if(numThreads > 1) | 1818 | 182 | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); | 1819 | 0 | else | 1820 | 0 | final_read(); | 1821 | | | 1822 | 182 | return true; | 1823 | 182 | } | 1824 | 6.31k | auto final_read = [this, sa, synthesisWindow, simpleBuf]() { | 1825 | | // final read into tile buffer | 1826 | 6.31k | bool ret = sa->read(numres_ - 1, synthesisWindow, simpleBuf.buf_, 1, simpleBuf.stride_); | 1827 | | | 1828 | 6.31k | return ret; | 1829 | 6.31k | }; | 1830 | | // pre-allocate all blocks | 1831 | 6.31k | std::vector<PartialBandInfo<FILTER_WIDTH>> resBandInfo; | 1832 | 45.9k | for(uint8_t resno = 1; resno < numres_; resno++) | 1833 | 39.6k | { | 1834 | 39.6k | PartialBandInfo<FILTER_WIDTH> bandInfo; | 1835 | 39.6k | if(!bandInfo.alloc(sa, resno, fullRes + resno - 1, buf)) | 1836 | 0 | return false; | 1837 | 39.6k | resBandInfo.push_back(bandInfo); | 1838 | 39.6k | } | 1839 | 6.31k | D decompressor; | 1840 | 45.8k | for(uint8_t resno = 1; resno < numres_; resno++) | 1841 | 39.5k | { | 1842 | 39.5k | dwt_data<T> horiz; | 1843 | 39.5k | dwt_data<T> vert; | 1844 | 39.5k | horiz.sn_full = fullRes->width(); | 1845 | 39.5k | vert.sn_full = fullRes->height(); | 1846 | 39.5k | fullRes++; | 1847 | 39.5k | horiz.dn_full = fullRes->width() - horiz.sn_full; | 1848 | 39.5k | horiz.parity = fullRes->x0 & 1; | 1849 | 39.5k | vert.dn_full = fullRes->height() - vert.sn_full; | 1850 | 39.5k | vert.parity = fullRes->y0 & 1; | 1851 | 39.5k | PartialBandInfo<FILTER_WIDTH>& bandInfo = resBandInfo[resno - 1]; | 1852 | | | 1853 | 39.5k | auto executor_h = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1854 | 39.5k | for(uint32_t yPos = 
taskInfo->indexMin_; yPos < taskInfo->indexMax_; | 1855 | 39.5k | yPos += HORIZ_PASS_HEIGHT) | 1856 | 39.5k | { | 1857 | 39.5k | auto height = | 1858 | 39.5k | std::min<uint32_t>((uint32_t)HORIZ_PASS_HEIGHT, taskInfo->indexMax_ - yPos); | 1859 | 39.5k | taskInfo->data.memL = taskInfo->data.mem + taskInfo->data.parity; | 1860 | 39.5k | taskInfo->data.memH = | 1861 | 39.5k | taskInfo->data.mem + (int64_t)(!taskInfo->data.parity) + | 1862 | 39.5k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0); | 1863 | 39.5k | if(!decompressor.interleave_h(&taskInfo->data, sa, yPos, height)) | 1864 | 39.5k | { | 1865 | 39.5k | return false; | 1866 | 39.5k | } | 1867 | 39.5k | taskInfo->data.memL = taskInfo->data.mem; | 1868 | 39.5k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1869 | 39.5k | (int64_t)taskInfo->data.win_l.x0); | 1870 | 39.5k | decompressor.decompress_h(&taskInfo->data); | 1871 | 39.5k | if(!sa->write(resno, | 1872 | 39.5k | grk_rect32(bandInfo.resWindowREL_.x0, yPos, bandInfo.resWindowREL_.x1, | 1873 | 39.5k | yPos + height), | 1874 | 39.5k | (int32_t*)(taskInfo->data.mem + (int64_t)bandInfo.resWindowREL_.x0 - | 1875 | 39.5k | 2 * (int64_t)taskInfo->data.win_l.x0), | 1876 | 39.5k | HORIZ_PASS_HEIGHT, 1)) | 1877 | 39.5k | { | 1878 | 39.5k | return false; | 1879 | 39.5k | } | 1880 | 39.5k | } | 1881 | | | 1882 | 39.5k | return true; | 1883 | 39.5k | }; | 1884 | 39.5k | auto executor_v = [resno, sa, bandInfo, &decompressor](TaskInfo<T, dwt_data<T>>* taskInfo) { | 1885 | 39.5k | for(uint32_t xPos = taskInfo->indexMin_; xPos < taskInfo->indexMax_; | 1886 | 39.5k | xPos += VERT_PASS_WIDTH) | 1887 | 39.5k | { | 1888 | 39.5k | auto width = std::min<uint32_t>(VERT_PASS_WIDTH, (taskInfo->indexMax_ - xPos)); | 1889 | 39.5k | taskInfo->data.memL = taskInfo->data.mem + (taskInfo->data.parity) * VERT_PASS_WIDTH; | 1890 | 39.5k | taskInfo->data.memH = | 1891 | 39.5k | taskInfo->data.mem + | 1892 | 39.5k | 
((!taskInfo->data.parity) + | 1893 | 39.5k | 2 * ((int64_t)taskInfo->data.win_h.x0 - (int64_t)taskInfo->data.win_l.x0)) * | 1894 | 39.5k | VERT_PASS_WIDTH; | 1895 | 39.5k | if(!decompressor.interleave_v(&taskInfo->data, sa, xPos, width)) | 1896 | 39.5k | { | 1897 | 39.5k | return false; | 1898 | 39.5k | } | 1899 | 39.5k | taskInfo->data.memL = taskInfo->data.mem; | 1900 | 39.5k | taskInfo->data.memH = taskInfo->data.mem + ((int64_t)taskInfo->data.win_h.x0 - | 1901 | 39.5k | (int64_t)taskInfo->data.win_l.x0) * | 1902 | 39.5k | VERT_PASS_WIDTH; | 1903 | 39.5k | decompressor.decompress_v(&taskInfo->data); | 1904 | | // write to buffer for final res | 1905 | 39.5k | if(!sa->write(resno, | 1906 | 39.5k | grk_rect32(xPos, bandInfo.resWindowREL_.y0, xPos + width, | 1907 | 39.5k | bandInfo.resWindowREL_.y0 + taskInfo->data.win_l.length() + | 1908 | 39.5k | taskInfo->data.win_h.length()), | 1909 | 39.5k | (int32_t*)(taskInfo->data.mem + ((int64_t)bandInfo.resWindowREL_.y0 - | 1910 | 39.5k | 2 * (int64_t)taskInfo->data.win_l.x0) * | 1911 | 39.5k | VERT_PASS_WIDTH), | 1912 | 39.5k | 1, VERT_PASS_WIDTH * (sizeof(T) / sizeof(int32_t)))) | 1913 | 39.5k | { | 1914 | 39.5k | Logger::logger_.error("Sparse array write failure"); | 1915 | 39.5k | return false; | 1916 | 39.5k | } | 1917 | 39.5k | } | 1918 | | | 1919 | 39.5k | return true; | 1920 | 39.5k | }; | 1921 | | | 1922 | | // 3. 
calculate synthesis | 1923 | 39.5k | horiz.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimX(); | 1924 | 39.5k | horiz.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_HL].dimX(); | 1925 | 39.5k | horiz.resno = resno; | 1926 | 39.5k | size_t dataLength = | 1927 | 39.5k | (bandInfo.splitWindowREL_[0].width() + 2 * FILTER_WIDTH) * HORIZ_PASS_HEIGHT; | 1928 | 39.5k | auto resFlow = imageComponentFlow->getResFlow(resno - 1); | 1929 | 118k | for(uint32_t k = 0; k < 2 && dataLength; ++k) | 1930 | 78.7k | { | 1931 | 78.7k | uint32_t numTasks = numThreads; | 1932 | 78.7k | uint32_t num_rows = bandInfo.splitWindowREL_[k].height(); | 1933 | 78.7k | if(num_rows < numTasks) | 1934 | 76.7k | numTasks = num_rows; | 1935 | 78.7k | uint32_t incrPerJob = numTasks ? (num_rows / numTasks) : 0; | 1936 | 78.7k | if(numThreads == 1) | 1937 | 0 | numTasks = 1; | 1938 | 78.7k | if(incrPerJob == 0) | 1939 | 12.9k | continue; | 1940 | 508k | for(uint32_t j = 0; j < numTasks; ++j) | 1941 | 442k | { | 1942 | 442k | uint32_t indexMin = bandInfo.splitWindowREL_[k].y0 + j * incrPerJob; | 1943 | 442k | uint32_t indexMax = j < (numTasks - 1U) | 1944 | 442k | ? 
bandInfo.splitWindowREL_[k].y0 + (j + 1U) * incrPerJob | 1945 | 442k | : bandInfo.splitWindowREL_[k].y1; | 1946 | 442k | if(indexMin == indexMax) | 1947 | 0 | continue; | 1948 | 442k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(horiz, indexMin, indexMax); | 1949 | 442k | if(!taskInfo->data.alloc(dataLength, pad)) | 1950 | 0 | { | 1951 | 0 | delete taskInfo; | 1952 | 0 | return false; | 1953 | 0 | } | 1954 | 442k | tasks.push_back(taskInfo); | 1955 | 442k | if(numThreads > 1) | 1956 | 455k | resFlow->waveletHoriz_->nextTask().work( | 1957 | 455k | [taskInfo, executor_h] { executor_h(taskInfo); }); | 1958 | 18.4E | else | 1959 | 18.4E | executor_h(taskInfo); | 1960 | 442k | } | 1961 | 65.7k | } | 1962 | 39.5k | dataLength = (bandInfo.resWindowREL_.height() + 2 * FILTER_WIDTH) * VERT_PASS_WIDTH * | 1963 | 39.5k | sizeof(T) / sizeof(int32_t); | 1964 | 39.5k | vert.win_l = bandInfo.bandWindowREL_[BAND_ORIENT_LL].dimY(); | 1965 | 39.5k | vert.win_h = bandInfo.bandWindowREL_[BAND_ORIENT_LH].dimY(); | 1966 | 39.5k | vert.resno = resno; | 1967 | 39.5k | uint32_t numTasks = numThreads; | 1968 | 39.5k | uint32_t numColumns = bandInfo.resWindowREL_.width(); | 1969 | 39.5k | if(numColumns < numTasks) | 1970 | 34.2k | numTasks = numColumns; | 1971 | 39.5k | uint32_t incrPerJob = numTasks ? (numColumns / numTasks) : 0; | 1972 | 39.5k | if(numThreads == 1) | 1973 | 0 | numTasks = 1; | 1974 | 391k | for(uint32_t j = 0; j < numTasks && incrPerJob > 0 && dataLength; ++j) | 1975 | 351k | { | 1976 | 351k | uint32_t indexMin = bandInfo.resWindowREL_.x0 + j * incrPerJob; | 1977 | 351k | uint32_t indexMax = j < (numTasks - 1U) ? 
bandInfo.resWindowREL_.x0 + (j + 1U) * incrPerJob | 1978 | 351k | : bandInfo.resWindowREL_.x1; | 1979 | 351k | if(indexMin == indexMax) | 1980 | 0 | continue; | 1981 | 351k | auto taskInfo = new TaskInfo<T, dwt_data<T>>(vert, indexMin, indexMax); | 1982 | 351k | if(!taskInfo->data.alloc(dataLength, pad)) | 1983 | 0 | { | 1984 | 0 | delete taskInfo; | 1985 | 0 | return false; | 1986 | 0 | } | 1987 | 351k | tasks.push_back(taskInfo); | 1988 | 351k | if(numThreads > 1) | 1989 | 360k | resFlow->waveletVert_->nextTask().work( | 1990 | 360k | [taskInfo, executor_v] { executor_v(taskInfo); }); | 1991 | 18.4E | else | 1992 | 18.4E | executor_v(taskInfo); | 1993 | 351k | } | 1994 | 39.5k | } | 1995 | | | 1996 | 6.31k | if(numThreads > 1) | 1997 | 6.31k | imageComponentFlow->waveletFinalCopy_->nextTask().work([final_read] { final_read(); }); | 1998 | 3 | else | 1999 | 3 | final_read(); | 2000 | | | 2001 | 6.31k | return true; | 2002 | 6.31k | } |
|
2003 | | WaveletReverse::WaveletReverse(TileProcessor* tileProcessor, TileComponent* tilec, uint16_t compno, |
2004 | | grk_rect32 unreducedWindow, uint8_t numres, uint8_t qmfbid) |
2005 | | : tileProcessor_(tileProcessor), scheduler_(tileProcessor->getScheduler()), tilec_(tilec), |
2006 | | compno_(compno), unreducedWindow_(unreducedWindow), numres_(numres), qmfbid_(qmfbid) |
2007 | 18.3k | {} |
2008 | | WaveletReverse::~WaveletReverse(void) |
2009 | 18.3k | { |
2010 | 18.3k | for(const auto& t : tasks_) |
2011 | 58.6k | delete t; |
2012 | 18.3k | for(const auto& t : tasksF_) |
2013 | 813k | delete t; |
2014 | 18.3k | } |
2015 | | bool WaveletReverse::decompress(void) |
2016 | 18.3k | { |
2017 | 18.3k | if(qmfbid_ == 1) |
2018 | 6.85k | { |
2019 | 6.85k | if(tileProcessor_->cp_->wholeTileDecompress_) |
2020 | 0 | return decompress_tile_53(); |
2021 | 6.85k | else |
2022 | 6.85k | { |
2023 | 6.85k | constexpr uint32_t VERT_PASS_WIDTH = 4; |
2024 | 6.85k | return decompress_partial_tile< |
2025 | 6.85k | int32_t, getFilterPad<uint32_t>(true), VERT_PASS_WIDTH, |
2026 | 6.85k | Partial53<int32_t, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH>>( |
2027 | 6.85k | tilec_->getRegionWindow(), tasks_); |
2028 | 6.85k | } |
2029 | 6.85k | } |
2030 | 11.4k | else |
2031 | 11.4k | { |
2032 | 11.4k | if(tileProcessor_->cp_->wholeTileDecompress_) |
2033 | 0 | return decompress_tile_97(); |
2034 | 11.4k | else |
2035 | 11.4k | { |
2036 | 11.4k | constexpr uint32_t VERT_PASS_WIDTH = 1; |
2037 | 11.4k | return decompress_partial_tile< |
2038 | 11.4k | vec4f, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH, |
2039 | 11.4k | Partial97<vec4f, getFilterPad<uint32_t>(false), VERT_PASS_WIDTH>>( |
2040 | 11.4k | tilec_->getRegionWindow(), tasksF_); |
2041 | 11.4k | } |
2042 | 11.4k | } |
2043 | 18.3k | } |
2044 | | |
2045 | | } // namespace grk |
2046 | | #endif |