/src/libjxl/lib/jxl/modular/transform/squeeze.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/modular/transform/squeeze.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cstdint> |
12 | | #include <cstdlib> |
13 | | #include <utility> |
14 | | #include <vector> |
15 | | |
16 | | #include "lib/jxl/base/common.h" |
17 | | #include "lib/jxl/base/compiler_specific.h" |
18 | | #include "lib/jxl/base/data_parallel.h" |
19 | | #include "lib/jxl/base/printf_macros.h" |
20 | | #include "lib/jxl/base/status.h" |
21 | | #include "lib/jxl/modular/modular_image.h" |
22 | | #include "lib/jxl/modular/transform/squeeze_params.h" |
23 | | #undef HWY_TARGET_INCLUDE |
24 | | #define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc" |
25 | | #include <hwy/foreach_target.h> |
26 | | #include <hwy/highway.h> |
27 | | |
28 | | #include "lib/jxl/simd_util-inl.h" |
29 | | |
30 | | HWY_BEFORE_NAMESPACE(); |
31 | | namespace jxl { |
32 | | namespace HWY_NAMESPACE { |
33 | | |
34 | | #if HWY_TARGET != HWY_SCALAR |
35 | | |
36 | | // These templates are not found via ADL. |
37 | | using hwy::HWY_NAMESPACE::Abs; |
38 | | using hwy::HWY_NAMESPACE::Add; |
39 | | using hwy::HWY_NAMESPACE::And; |
40 | | using hwy::HWY_NAMESPACE::DupEven; |
41 | | using hwy::HWY_NAMESPACE::DupOdd; |
42 | | using hwy::HWY_NAMESPACE::Gt; |
43 | | using hwy::HWY_NAMESPACE::IfThenElse; |
44 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
45 | | using hwy::HWY_NAMESPACE::Lt; |
46 | | using hwy::HWY_NAMESPACE::MulEven; |
47 | | using hwy::HWY_NAMESPACE::MulOdd; |
48 | | using hwy::HWY_NAMESPACE::Ne; |
49 | | using hwy::HWY_NAMESPACE::Neg; |
50 | | using hwy::HWY_NAMESPACE::OddEven; |
51 | | using hwy::HWY_NAMESPACE::RebindToUnsigned; |
52 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
53 | | using hwy::HWY_NAMESPACE::ShiftRight; |
54 | | using hwy::HWY_NAMESPACE::Sub; |
55 | | using hwy::HWY_NAMESPACE::Xor; |
56 | | |
57 | | using D = HWY_CAPPED(pixel_type, 8); |
58 | | using DU = RebindToUnsigned<D>; |
59 | | constexpr D d; |
60 | | constexpr DU du; |
61 | | |
62 | | JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual, |
63 | | const pixel_type *JXL_RESTRICT p_avg, |
64 | | const pixel_type *JXL_RESTRICT p_navg, |
65 | | const pixel_type *p_pout, |
66 | | pixel_type *JXL_RESTRICT p_out, |
67 | 8.71M | pixel_type *p_nout) { |
68 | 8.71M | const size_t N = Lanes(d); |
69 | 8.71M | auto onethird = Set(d, 0x55555556); |
70 | 17.4M | for (size_t x = 0; x < 8; x += N) { |
71 | 8.71M | auto avg = Load(d, p_avg + x); |
72 | 8.71M | auto next_avg = Load(d, p_navg + x); |
73 | 8.71M | auto top = Load(d, p_pout + x); |
74 | | // Equivalent to SmoothTendency(top,avg,next_avg), but without branches |
75 | | // typo:off |
76 | 8.71M | auto Ba = Sub(top, avg); |
77 | 8.71M | auto an = Sub(avg, next_avg); |
78 | 8.71M | auto nonmono = Xor(Ba, an); |
79 | 8.71M | auto absBa = Abs(Ba); |
80 | 8.71M | auto absan = Abs(an); |
81 | 8.71M | auto absBn = Abs(Sub(top, next_avg)); |
82 | | // Compute a3 = absBa / 3 |
83 | 8.71M | auto a3eh = MulEven(absBa, onethird); |
84 | 8.71M | auto a3oh = MulOdd(absBa, onethird); |
85 | | |
86 | 8.71M | #if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2)) |
87 | 8.71M | #if HWY_IS_LITTLE_ENDIAN |
88 | 8.71M | auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
89 | | #else // not little endian |
90 | | auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
91 | | #endif // endianness |
92 | | #else // hwy < 1.2 |
93 | | #if HWY_IS_LITTLE_ENDIAN |
94 | | auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh))); |
95 | | #else // not little endian |
96 | | auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh)) |
97 | | #endif // endianness |
98 | | #endif // hwy version |
99 | | |
100 | 8.71M | a3 = Add(a3, Add(absBn, Set(d, 2))); |
101 | 8.71M | auto absdiff = ShiftRight<2>(a3); |
102 | 8.71M | auto skipdiff = Ne(Ba, Zero(d)); |
103 | 8.71M | skipdiff = And(skipdiff, Ne(an, Zero(d))); |
104 | 8.71M | skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); |
105 | 8.71M | auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); |
106 | 8.71M | absdiff = IfThenElse(Gt(absdiff, absBa2), |
107 | 8.71M | Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); |
108 | | // typo:on |
109 | 8.71M | auto absan2 = ShiftLeft<1>(absan); |
110 | 8.71M | absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), |
111 | 8.71M | absan2, absdiff); |
112 | 8.71M | auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); |
113 | 8.71M | auto tendency = IfThenZeroElse(skipdiff, diff1); |
114 | | |
115 | 8.71M | auto diff_minus_tendency = Load(d, p_residual + x); |
116 | 8.71M | auto diff = Add(diff_minus_tendency, tendency); |
117 | 8.71M | auto out = |
118 | 8.71M | Add(avg, ShiftRight<1>( |
119 | 8.71M | Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); |
120 | 8.71M | Store(out, d, p_out + x); |
121 | 8.71M | Store(Sub(out, diff), d, p_nout + x); |
122 | 8.71M | } |
123 | 8.71M | } Unexecuted instantiation: jxl::N_SSE4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) jxl::N_AVX2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) Line | Count | Source | 67 | 8.71M | pixel_type *p_nout) { | 68 | 8.71M | const size_t N = Lanes(d); | 69 | 8.71M | auto onethird = Set(d, 0x55555556); | 70 | 17.4M | for (size_t x = 0; x < 8; x += N) { | 71 | 8.71M | auto avg = Load(d, p_avg + x); | 72 | 8.71M | auto next_avg = Load(d, p_navg + x); | 73 | 8.71M | auto top = Load(d, p_pout + x); | 74 | | // Equivalent to SmoothTendency(top,avg,next_avg), but without branches | 75 | | // typo:off | 76 | 8.71M | auto Ba = Sub(top, avg); | 77 | 8.71M | auto an = Sub(avg, next_avg); | 78 | 8.71M | auto nonmono = Xor(Ba, an); | 79 | 8.71M | auto absBa = Abs(Ba); | 80 | 8.71M | auto absan = Abs(an); | 81 | 8.71M | auto absBn = Abs(Sub(top, next_avg)); | 82 | | // Compute a3 = absBa / 3 | 83 | 8.71M | auto a3eh = MulEven(absBa, onethird); | 84 | 8.71M | auto a3oh = MulOdd(absBa, onethird); | 85 | | | 86 | 8.71M | #if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2)) | 87 | 8.71M | #if HWY_IS_LITTLE_ENDIAN | 88 | 8.71M | auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh)); | 89 | | #else // not little endian | 90 | | auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh)); | 91 | | #endif // endianness | 92 | | #else // hwy < 1.2 | 93 | | #if HWY_IS_LITTLE_ENDIAN | 94 | | auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh))); | 95 | | #else // not little endian | 96 | | auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh)) | 97 | | #endif // endianness | 98 | | #endif // hwy version | 99 | | | 100 | 8.71M | a3 = Add(a3, Add(absBn, Set(d, 2))); | 101 | 8.71M | auto absdiff = ShiftRight<2>(a3); | 102 | 8.71M | auto skipdiff = Ne(Ba, Zero(d)); | 103 | 8.71M | skipdiff = And(skipdiff, Ne(an, Zero(d))); | 104 | 8.71M | skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); | 105 | 8.71M | auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); | 106 | 8.71M | absdiff = IfThenElse(Gt(absdiff, absBa2), | 107 | 8.71M | Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); | 108 | | // typo:on | 109 | 8.71M | auto absan2 = ShiftLeft<1>(absan); | 110 | 8.71M | absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), | 111 | 8.71M | absan2, absdiff); | 112 | 8.71M | auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); | 113 | 8.71M | auto tendency = IfThenZeroElse(skipdiff, diff1); | 114 | | | 115 | 8.71M | auto diff_minus_tendency = Load(d, p_residual + x); | 116 | 8.71M | auto diff = Add(diff_minus_tendency, tendency); | 117 | 8.71M | auto out = | 118 | 8.71M | Add(avg, ShiftRight<1>( | 119 | 8.71M | Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); | 120 | 8.71M | Store(out, d, p_out + x); | 121 | 8.71M | Store(Sub(out, diff), d, p_nout + x); | 122 | 8.71M | } | 123 | 8.71M | } |
Unexecuted instantiation: jxl::N_SSE2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) |
124 | | |
125 | | #endif // HWY_TARGET != HWY_SCALAR |
126 | | |
127 | 38.1k | Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
128 | 38.1k | JXL_ENSURE(c < input.channel.size()); |
129 | 38.1k | JXL_ENSURE(rc < input.channel.size()); |
130 | 38.1k | Channel &chin = input.channel[c]; |
131 | 38.1k | const Channel &chin_residual = input.channel[rc]; |
132 | | // These must be valid since we ran MetaApply already. |
133 | 38.1k | JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2)); |
134 | 38.1k | JXL_ENSURE(chin.h == chin_residual.h); |
135 | 38.1k | JxlMemoryManager *memory_manager = input.memory_manager(); |
136 | | |
137 | 38.1k | if (chin_residual.w == 0) { |
138 | | // Short-circuit: output channel has same dimensions as input. |
139 | 254 | input.channel[c].hshift--; |
140 | 254 | return true; |
141 | 254 | } |
142 | | |
143 | | // Note: chin.w >= chin_residual.w and at most 1 different. |
144 | 75.7k | JXL_ASSIGN_OR_RETURN(Channel chout, |
145 | 75.7k | Channel::Create(memory_manager, chin.w + chin_residual.w, |
146 | 75.7k | chin.h, chin.hshift - 1, chin.vshift)); |
147 | 75.7k | JXL_DEBUG_V(4, |
148 | 75.7k | "Undoing horizontal squeeze of channel %i using residuals in " |
149 | 75.7k | "channel %i (going from width %" PRIuS " to %" PRIuS ")", |
150 | 75.7k | c, rc, chin.w, chout.w); |
151 | | |
152 | 75.7k | if (chin_residual.h == 0) { |
153 | | // Short-circuit: channel with no pixels. |
154 | 0 | input.channel[c] = std::move(chout); |
155 | 0 | return true; |
156 | 0 | } |
157 | 1.02M | auto unsqueeze_row = [&](size_t y, size_t x0) { |
158 | 1.02M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); |
159 | 1.02M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); |
160 | 1.02M | pixel_type *JXL_RESTRICT p_out = chout.Row(y); |
161 | 9.61M | for (size_t x = x0; x < chin_residual.w; x++) { |
162 | 8.59M | pixel_type_w diff_minus_tendency = p_residual[x]; |
163 | 8.59M | pixel_type_w avg = p_avg[x]; |
164 | 8.59M | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); |
165 | 8.59M | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); |
166 | 8.59M | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); |
167 | 8.59M | pixel_type_w diff = diff_minus_tendency + tendency; |
168 | 8.59M | pixel_type_w A = avg + (diff / 2); |
169 | 8.59M | p_out[(x << 1)] = A; |
170 | 8.59M | pixel_type_w B = A - diff; |
171 | 8.59M | p_out[(x << 1) + 1] = B; |
172 | 8.59M | } |
173 | 1.02M | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; |
174 | 1.02M | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const Line | Count | Source | 157 | 1.02M | auto unsqueeze_row = [&](size_t y, size_t x0) { | 158 | 1.02M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); | 159 | 1.02M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); | 160 | 1.02M | pixel_type *JXL_RESTRICT p_out = chout.Row(y); | 161 | 9.61M | for (size_t x = x0; x < chin_residual.w; x++) { | 162 | 8.59M | pixel_type_w diff_minus_tendency = p_residual[x]; | 163 | 8.59M | pixel_type_w avg = p_avg[x]; | 164 | 8.59M | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); | 165 | 8.59M | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); | 166 | 8.59M | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); | 167 | 8.59M | pixel_type_w diff = diff_minus_tendency + tendency; | 168 | 8.59M | pixel_type_w A = avg + (diff / 2); | 169 | 8.59M | p_out[(x << 1)] = A; | 170 | 8.59M | pixel_type_w B = A - diff; | 171 | 8.59M | p_out[(x << 1) + 1] = B; | 172 | 8.59M | } | 173 | 1.02M | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; | 174 | 1.02M | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const |
175 | | |
176 | | // somewhat complicated trickery just to be able to SIMD this. |
177 | | // Horizontal unsqueeze has horizontal data dependencies, so we do |
178 | | // 8 rows at a time and treat it as a vertical unsqueeze of a |
179 | | // transposed 8x8 block (or 9x8 for one input). |
180 | 37.8k | static constexpr const size_t kRowsPerThread = 8; |
181 | 37.8k | const auto unsqueeze_span = [&](const uint32_t task, |
182 | 145k | size_t /* thread */) -> Status { |
183 | 145k | const size_t y0 = task * kRowsPerThread; |
184 | 145k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); |
185 | 145k | size_t x = 0; |
186 | | |
187 | 145k | #if HWY_TARGET != HWY_SCALAR |
188 | 145k | intptr_t onerow_in = chin.plane.PixelsPerRow(); |
189 | 145k | intptr_t onerow_inr = chin_residual.plane.PixelsPerRow(); |
190 | 145k | intptr_t onerow_out = chout.plane.PixelsPerRow(); |
191 | 145k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); |
192 | 145k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); |
193 | 145k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); |
194 | 145k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; |
195 | 145k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; |
196 | 145k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; |
197 | 145k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; |
198 | 145k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; |
199 | 145k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; |
200 | 145k | const size_t N = Lanes(d); |
201 | 145k | if (chin_residual.w > 16 && rows == kRowsPerThread) { |
202 | 754k | for (; x < chin_residual.w - 9; x += 8) { |
203 | 665k | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); |
204 | 665k | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); |
205 | 5.99M | for (size_t y = 0; y < kRowsPerThread; y++) { |
206 | 5.32M | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; |
207 | 5.32M | } |
208 | 5.99M | for (size_t i = 0; i < 8; i++) { |
209 | 5.32M | FastUnsqueeze( |
210 | 5.32M | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), |
211 | 5.32M | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), |
212 | 5.32M | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); |
213 | 5.32M | } |
214 | | |
215 | 665k | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); |
216 | 665k | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); |
217 | 5.99M | for (size_t y = 0; y < kRowsPerThread; y++) { |
218 | 10.6M | for (size_t i = 0; i < kRowsPerThread; i += N) { |
219 | 5.32M | auto even = Load(d, b_p_out_evenT + 8 * y + i); |
220 | 5.32M | auto odd = Load(d, b_p_out_oddT + 8 * y + i); |
221 | 5.32M | StoreInterleaved(d, even, odd, |
222 | 5.32M | p_out + ((x + i) << 1) + onerow_out * y); |
223 | 5.32M | } |
224 | 5.32M | } |
225 | 665k | } |
226 | 89.1k | } |
227 | 145k | #endif // HWY_TARGET != HWY_SCALAR |
228 | 1.17M | for (size_t y = 0; y < rows; y++) { |
229 | 1.02M | unsqueeze_row(y0 + y, x); |
230 | 1.02M | } |
231 | 145k | return true; |
232 | 145k | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const Line | Count | Source | 182 | 145k | size_t /* thread */) -> Status { | 183 | 145k | const size_t y0 = task * kRowsPerThread; | 184 | 145k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); | 185 | 145k | size_t x = 0; | 186 | | | 187 | 145k | #if HWY_TARGET != HWY_SCALAR | 188 | 145k | intptr_t onerow_in = chin.plane.PixelsPerRow(); | 189 | 145k | intptr_t onerow_inr = chin_residual.plane.PixelsPerRow(); | 190 | 145k | intptr_t onerow_out = chout.plane.PixelsPerRow(); | 191 | 145k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); | 192 | 145k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); | 193 | 145k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); | 194 | 145k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; | 195 | 145k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; | 196 | 145k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; | 197 | 145k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; | 198 | 145k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; | 199 | 145k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; | 200 | 145k | const size_t N = Lanes(d); | 201 | 145k | if (chin_residual.w > 16 && rows == kRowsPerThread) { | 202 | 754k | for (; x < chin_residual.w - 9; x += 8) { | 203 | 665k | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); | 204 | 665k | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); | 205 | 5.99M | for (size_t y = 0; y < kRowsPerThread; y++) { | 206 | 5.32M | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; | 207 | 5.32M | } | 208 | 5.99M | for (size_t i = 0; i < 8; i++) { | 209 | 5.32M | FastUnsqueeze( | 210 | 5.32M | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), | 211 | 5.32M | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), | 212 | 5.32M | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); | 213 | 5.32M | } | 214 | | | 215 | 665k | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); | 216 | 665k | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); | 217 | 5.99M | for (size_t y = 0; y < kRowsPerThread; y++) { | 218 | 10.6M | for (size_t i = 0; i < kRowsPerThread; i += N) { | 219 | 5.32M | auto even = Load(d, b_p_out_evenT + 8 * y + i); | 220 | 5.32M | auto odd = Load(d, b_p_out_oddT + 8 * y + i); | 221 | 5.32M | StoreInterleaved(d, even, odd, | 222 | 5.32M | p_out + ((x + i) << 1) + onerow_out * y); | 223 | 5.32M | } | 224 | 5.32M | } | 225 | 665k | } | 226 | 89.1k | } | 227 | 145k | #endif // HWY_TARGET != HWY_SCALAR | 228 | 1.17M | for (size_t y = 0; y < rows; y++) { | 229 | 1.02M | unsqueeze_row(y0 + y, x); | 230 | 1.02M | } | 231 | 145k | return true; | 232 | 145k | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const |
233 | 37.8k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), |
234 | 37.8k | ThreadPool::NoInit, unsqueeze_span, |
235 | 37.8k | "InvHorizontalSqueeze")); |
236 | 37.8k | input.channel[c] = std::move(chout); |
237 | 37.8k | return true; |
238 | 37.8k | } Unexecuted instantiation: jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Line | Count | Source | 127 | 38.1k | Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { | 128 | 38.1k | JXL_ENSURE(c < input.channel.size()); | 129 | 38.1k | JXL_ENSURE(rc < input.channel.size()); | 130 | 38.1k | Channel &chin = input.channel[c]; | 131 | 38.1k | const Channel &chin_residual = input.channel[rc]; | 132 | | // These must be valid since we ran MetaApply already. | 133 | 38.1k | JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2)); | 134 | 38.1k | JXL_ENSURE(chin.h == chin_residual.h); | 135 | 38.1k | JxlMemoryManager *memory_manager = input.memory_manager(); | 136 | | | 137 | 38.1k | if (chin_residual.w == 0) { | 138 | | // Short-circuit: output channel has same dimensions as input. | 139 | 254 | input.channel[c].hshift--; | 140 | 254 | return true; | 141 | 254 | } | 142 | | | 143 | | // Note: chin.w >= chin_residual.w and at most 1 different. | 144 | 75.7k | JXL_ASSIGN_OR_RETURN(Channel chout, | 145 | 75.7k | Channel::Create(memory_manager, chin.w + chin_residual.w, | 146 | 75.7k | chin.h, chin.hshift - 1, chin.vshift)); | 147 | 75.7k | JXL_DEBUG_V(4, | 148 | 75.7k | "Undoing horizontal squeeze of channel %i using residuals in " | 149 | 75.7k | "channel %i (going from width %" PRIuS " to %" PRIuS ")", | 150 | 75.7k | c, rc, chin.w, chout.w); | 151 | | | 152 | 75.7k | if (chin_residual.h == 0) { | 153 | | // Short-circuit: channel with no pixels. | 154 | 0 | input.channel[c] = std::move(chout); | 155 | 0 | return true; | 156 | 0 | } | 157 | 37.8k | auto unsqueeze_row = [&](size_t y, size_t x0) { | 158 | 37.8k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); | 159 | 37.8k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); | 160 | 37.8k | pixel_type *JXL_RESTRICT p_out = chout.Row(y); | 161 | 37.8k | for (size_t x = x0; x < chin_residual.w; x++) { | 162 | 37.8k | pixel_type_w diff_minus_tendency = p_residual[x]; | 163 | 37.8k | pixel_type_w avg = p_avg[x]; | 164 | 37.8k | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); | 165 | 37.8k | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); | 166 | 37.8k | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); | 167 | 37.8k | pixel_type_w diff = diff_minus_tendency + tendency; | 168 | 37.8k | pixel_type_w A = avg + (diff / 2); | 169 | 37.8k | p_out[(x << 1)] = A; | 170 | 37.8k | pixel_type_w B = A - diff; | 171 | 37.8k | p_out[(x << 1) + 1] = B; | 172 | 37.8k | } | 173 | 37.8k | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; | 174 | 37.8k | }; | 175 | | | 176 | | // somewhat complicated trickery just to be able to SIMD this. | 177 | | // Horizontal unsqueeze has horizontal data dependencies, so we do | 178 | | // 8 rows at a time and treat it as a vertical unsqueeze of a | 179 | | // transposed 8x8 block (or 9x8 for one input). | 180 | 37.8k | static constexpr const size_t kRowsPerThread = 8; | 181 | 37.8k | const auto unsqueeze_span = [&](const uint32_t task, | 182 | 37.8k | size_t /* thread */) -> Status { | 183 | 37.8k | const size_t y0 = task * kRowsPerThread; | 184 | 37.8k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); | 185 | 37.8k | size_t x = 0; | 186 | | | 187 | 37.8k | #if HWY_TARGET != HWY_SCALAR | 188 | 37.8k | intptr_t onerow_in = chin.plane.PixelsPerRow(); | 189 | 37.8k | intptr_t onerow_inr = chin_residual.plane.PixelsPerRow(); | 190 | 37.8k | intptr_t onerow_out = chout.plane.PixelsPerRow(); | 191 | 37.8k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); | 192 | 37.8k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); | 193 | 37.8k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); | 194 | 37.8k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; | 195 | 37.8k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; | 196 | 37.8k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; | 197 | 37.8k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; | 198 | 37.8k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; | 199 | 37.8k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; | 200 | 37.8k | const size_t N = Lanes(d); | 201 | 37.8k | if (chin_residual.w > 16 && rows == kRowsPerThread) { | 202 | 37.8k | for (; x < chin_residual.w - 9; x += 8) { | 203 | 37.8k | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); | 204 | 37.8k | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); | 205 | 37.8k | for (size_t y = 0; y < kRowsPerThread; y++) { | 206 | 37.8k | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; | 207 | 37.8k | } | 208 | 37.8k | for (size_t i = 0; i < 8; i++) { | 209 | 37.8k | FastUnsqueeze( | 210 | 37.8k | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), | 211 | 37.8k | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), | 212 | 37.8k | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); | 213 | 37.8k | } | 214 | | | 215 | 37.8k | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); | 216 | 37.8k | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); | 217 | 37.8k | for (size_t y = 0; y < kRowsPerThread; y++) { | 218 | 37.8k | for (size_t i = 0; i < kRowsPerThread; i += N) { | 219 | 37.8k | auto even = Load(d, b_p_out_evenT + 8 * y + i); | 220 | 37.8k | auto odd = Load(d, b_p_out_oddT + 8 * y + i); | 221 | 37.8k | StoreInterleaved(d, even, odd, | 222 | 37.8k | p_out + ((x + i) << 1) + onerow_out * y); | 223 | 37.8k | } | 224 | 37.8k | } | 225 | 37.8k | } | 226 | 37.8k | } | 227 | 37.8k | #endif // HWY_TARGET != HWY_SCALAR | 228 | 37.8k | for (size_t y = 0; y < rows; y++) { | 229 | 37.8k | unsqueeze_row(y0 + y, x); | 230 | 37.8k | } | 231 | 37.8k | return true; | 232 | 37.8k | }; | 233 | 37.8k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), | 234 | 37.8k | ThreadPool::NoInit, unsqueeze_span, | 235 | 37.8k | "InvHorizontalSqueeze")); | 236 | 37.8k | input.channel[c] = std::move(chout); | 237 | 37.8k | return true; | 238 | 37.8k | } |
Unexecuted instantiation: jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) |
239 | | |
240 | 33.3k | Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
241 | 33.3k | JXL_ENSURE(c < input.channel.size()); |
242 | 33.3k | JXL_ENSURE(rc < input.channel.size()); |
243 | 33.3k | const Channel &chin = input.channel[c]; |
244 | 33.3k | const Channel &chin_residual = input.channel[rc]; |
245 | | // These must be valid since we ran MetaApply already. |
246 | 33.3k | JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2)); |
247 | 33.3k | JXL_ENSURE(chin.w == chin_residual.w); |
248 | 33.3k | JxlMemoryManager *memory_manager = input.memory_manager(); |
249 | | |
250 | 33.3k | if (chin_residual.h == 0) { |
251 | | // Short-circuit: output channel has same dimensions as input. |
252 | 406 | input.channel[c].vshift--; |
253 | 406 | return true; |
254 | 406 | } |
255 | | |
256 | | // Note: chin.h >= chin_residual.h and at most 1 different. |
257 | 65.9k | JXL_ASSIGN_OR_RETURN( |
258 | 65.9k | Channel chout, |
259 | 65.9k | Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h, |
260 | 65.9k | chin.hshift, chin.vshift - 1)); |
261 | 65.9k | JXL_DEBUG_V( |
262 | 65.9k | 4, |
263 | 65.9k | "Undoing vertical squeeze of channel %i using residuals in channel " |
264 | 65.9k | "%i (going from height %" PRIuS " to %" PRIuS ")", |
265 | 65.9k | c, rc, chin.h, chout.h); |
266 | | |
267 | 65.9k | if (chin_residual.w == 0) { |
268 | | // Short-circuit: channel with no pixels. |
269 | 0 | input.channel[c] = std::move(chout); |
270 | 0 | return true; |
271 | 0 | } |
272 | | |
273 | 32.9k | static constexpr const int kColsPerThread = 64; |
274 | 32.9k | const auto unsqueeze_slice = [&](const uint32_t task, |
275 | 37.5k | size_t /* thread */) -> Status { |
276 | 37.5k | const size_t x0 = task * kColsPerThread; |
277 | 37.5k | const size_t x1 = |
278 | 37.5k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); |
279 | 37.5k | const size_t w = x1 - x0; |
280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is |
281 | | // always chin_residual.h. |
282 | 807k | for (size_t y = 0; y < chin_residual.h; y++) { |
283 | 769k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; |
284 | 769k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; |
285 | 769k | const pixel_type *JXL_RESTRICT p_navg = |
286 | 769k | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; |
287 | 769k | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; |
288 | 769k | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; |
289 | 769k | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; |
290 | 769k | size_t x = 0; |
291 | 769k | #if HWY_TARGET != HWY_SCALAR |
292 | 4.15M | for (; x + 7 < w; x += 8) { |
293 | 3.38M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, |
294 | 3.38M | p_out + x, p_nout + x); |
295 | 3.38M | } |
296 | 769k | #endif |
297 | 2.57M | for (; x < w; x++) { |
298 | 1.80M | pixel_type_w avg = p_avg[x]; |
299 | 1.80M | pixel_type_w next_avg = p_navg[x]; |
300 | 1.80M | pixel_type_w top = p_pout[x]; |
301 | 1.80M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); |
302 | 1.80M | pixel_type_w diff_minus_tendency = p_residual[x]; |
303 | 1.80M | pixel_type_w diff = diff_minus_tendency + tendency; |
304 | 1.80M | pixel_type_w out = avg + (diff / 2); |
305 | 1.80M | p_out[x] = out; |
306 | | // If the chin_residual.h == chin.h, the output has an even number |
307 | | // of rows so the next line is fine. Otherwise, this loop won't |
308 | | // write to the last output row which is handled separately. |
309 | 1.80M | p_nout[x] = out - diff; |
310 | 1.80M | } |
311 | 769k | } |
312 | 37.5k | return true; |
313 | 37.5k | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const squeeze.cc:jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 275 | 37.5k | size_t /* thread */) -> Status { | 276 | 37.5k | const size_t x0 = task * kColsPerThread; | 277 | 37.5k | const size_t x1 = | 278 | 37.5k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 279 | 37.5k | const size_t w = x1 - x0; | 280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 281 | | // always chin_residual.h. | 282 | 807k | for (size_t y = 0; y < chin_residual.h; y++) { | 283 | 769k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 284 | 769k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 285 | 769k | const pixel_type *JXL_RESTRICT p_navg = | 286 | 769k | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 287 | 769k | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 288 | 769k | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 289 | 769k | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 290 | 769k | size_t x = 0; | 291 | 769k | #if HWY_TARGET != HWY_SCALAR | 292 | 4.15M | for (; x + 7 < w; x += 8) { | 293 | 3.38M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 294 | 3.38M | p_out + x, p_nout + x); | 295 | 3.38M | } | 296 | 769k | #endif | 297 | 2.57M | for (; x < w; x++) { | 298 | 1.80M | pixel_type_w avg = p_avg[x]; | 299 | 1.80M | pixel_type_w next_avg = p_navg[x]; | 300 | 1.80M | pixel_type_w top = p_pout[x]; | 301 | 1.80M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 302 | 1.80M | pixel_type_w diff_minus_tendency = p_residual[x]; | 303 | 1.80M | pixel_type_w diff = diff_minus_tendency + tendency; | 304 | 1.80M | pixel_type_w out = avg + (diff / 2); | 305 | 1.80M | p_out[x] = out; | 306 | | // If the chin_residual.h == chin.h, the output has an even number | 307 | | // of rows so the next line is fine. Otherwise, this loop won't | 308 | | // write to the last output row which is handled separately. | 309 | 1.80M | p_nout[x] = out - diff; | 310 | 1.80M | } | 311 | 769k | } | 312 | 37.5k | return true; | 313 | 37.5k | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const |
314 | 32.9k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), |
315 | 32.9k | ThreadPool::NoInit, unsqueeze_slice, |
316 | 32.9k | "InvVertSqueeze")); |
317 | | |
318 | 32.9k | if (chout.h & 1) { |
319 | 16.2k | size_t y = chin.h - 1; |
320 | 16.2k | const pixel_type *p_avg = chin.Row(y); |
321 | 16.2k | pixel_type *p_out = chout.Row(y << 1); |
322 | 465k | for (size_t x = 0; x < chin.w; x++) { |
323 | 449k | p_out[x] = p_avg[x]; |
324 | 449k | } |
325 | 16.2k | } |
326 | 32.9k | input.channel[c] = std::move(chout); |
327 | 32.9k | return true; |
328 | 32.9k | } Unexecuted instantiation: jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Line | Count | Source | 240 | 33.3k | Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { | 241 | 33.3k | JXL_ENSURE(c < input.channel.size()); | 242 | 33.3k | JXL_ENSURE(rc < input.channel.size()); | 243 | 33.3k | const Channel &chin = input.channel[c]; | 244 | 33.3k | const Channel &chin_residual = input.channel[rc]; | 245 | | // These must be valid since we ran MetaApply already. | 246 | 33.3k | JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2)); | 247 | 33.3k | JXL_ENSURE(chin.w == chin_residual.w); | 248 | 33.3k | JxlMemoryManager *memory_manager = input.memory_manager(); | 249 | | | 250 | 33.3k | if (chin_residual.h == 0) { | 251 | | // Short-circuit: output channel has same dimensions as input. | 252 | 406 | input.channel[c].vshift--; | 253 | 406 | return true; | 254 | 406 | } | 255 | | | 256 | | // Note: chin.h >= chin_residual.h and at most 1 different. | 257 | 65.9k | JXL_ASSIGN_OR_RETURN( | 258 | 65.9k | Channel chout, | 259 | 65.9k | Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h, | 260 | 65.9k | chin.hshift, chin.vshift - 1)); | 261 | 65.9k | JXL_DEBUG_V( | 262 | 65.9k | 4, | 263 | 65.9k | "Undoing vertical squeeze of channel %i using residuals in channel " | 264 | 65.9k | "%i (going from height %" PRIuS " to %" PRIuS ")", | 265 | 65.9k | c, rc, chin.h, chout.h); | 266 | | | 267 | 65.9k | if (chin_residual.w == 0) { | 268 | | // Short-circuit: channel with no pixels. | 269 | 0 | input.channel[c] = std::move(chout); | 270 | 0 | return true; | 271 | 0 | } | 272 | | | 273 | 32.9k | static constexpr const int kColsPerThread = 64; | 274 | 32.9k | const auto unsqueeze_slice = [&](const uint32_t task, | 275 | 32.9k | size_t /* thread */) -> Status { | 276 | 32.9k | const size_t x0 = task * kColsPerThread; | 277 | 32.9k | const size_t x1 = | 278 | 32.9k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 279 | 32.9k | const size_t w = x1 - x0; | 280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 281 | | // always chin_residual.h. | 282 | 32.9k | for (size_t y = 0; y < chin_residual.h; y++) { | 283 | 32.9k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 284 | 32.9k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 285 | 32.9k | const pixel_type *JXL_RESTRICT p_navg = | 286 | 32.9k | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 287 | 32.9k | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 288 | 32.9k | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 289 | 32.9k | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 290 | 32.9k | size_t x = 0; | 291 | 32.9k | #if HWY_TARGET != HWY_SCALAR | 292 | 32.9k | for (; x + 7 < w; x += 8) { | 293 | 32.9k | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 294 | 32.9k | p_out + x, p_nout + x); | 295 | 32.9k | } | 296 | 32.9k | #endif | 297 | 32.9k | for (; x < w; x++) { | 298 | 32.9k | pixel_type_w avg = p_avg[x]; | 299 | 32.9k | pixel_type_w next_avg = p_navg[x]; | 300 | 32.9k | pixel_type_w top = p_pout[x]; | 301 | 32.9k | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 302 | 32.9k | pixel_type_w diff_minus_tendency = p_residual[x]; | 303 | 32.9k | pixel_type_w diff = diff_minus_tendency + tendency; | 304 | 32.9k | pixel_type_w out = avg + (diff / 2); | 305 | 32.9k | p_out[x] = out; | 306 | | // If the chin_residual.h == chin.h, the output has an even number | 307 | | // of rows so the next line is fine. Otherwise, this loop won't | 308 | | // write to the last output row which is handled separately. | 309 | 32.9k | p_nout[x] = out - diff; | 310 | 32.9k | } | 311 | 32.9k | } | 312 | 32.9k | return true; | 313 | 32.9k | }; | 314 | 32.9k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), | 315 | 32.9k | ThreadPool::NoInit, unsqueeze_slice, | 316 | 32.9k | "InvVertSqueeze")); | 317 | | | 318 | 32.9k | if (chout.h & 1) { | 319 | 16.2k | size_t y = chin.h - 1; | 320 | 16.2k | const pixel_type *p_avg = chin.Row(y); | 321 | 16.2k | pixel_type *p_out = chout.Row(y << 1); | 322 | 465k | for (size_t x = 0; x < chin.w; x++) { | 323 | 449k | p_out[x] = p_avg[x]; | 324 | 449k | } | 325 | 16.2k | } | 326 | 32.9k | input.channel[c] = std::move(chout); | 327 | 32.9k | return true; | 328 | 32.9k | } |
Unexecuted instantiation: jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) |
329 | | |
330 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
331 | 9.29k | ThreadPool *pool) { |
332 | 35.0k | for (int i = parameters.size() - 1; i >= 0; i--) { |
333 | 25.7k | JXL_RETURN_IF_ERROR( |
334 | 25.7k | CheckMetaSqueezeParams(parameters[i], input.channel.size())); |
335 | 25.7k | bool horizontal = parameters[i].horizontal; |
336 | 25.7k | bool in_place = parameters[i].in_place; |
337 | 25.7k | uint32_t beginc = parameters[i].begin_c; |
338 | 25.7k | uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; |
339 | 25.7k | uint32_t offset; |
340 | 25.7k | if (in_place) { |
341 | 18.7k | offset = endc + 1; |
342 | 18.7k | } else { |
343 | 7.08k | offset = input.channel.size() + beginc - endc - 1; |
344 | 7.08k | } |
345 | 25.7k | if (beginc < input.nb_meta_channels) { |
346 | | // This is checked in MetaSqueeze. |
347 | 0 | JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c); |
348 | 0 | input.nb_meta_channels -= parameters[i].num_c; |
349 | 0 | } |
350 | | |
351 | 97.2k | for (uint32_t c = beginc; c <= endc; c++) { |
352 | 71.4k | uint32_t rc = offset + c - beginc; |
353 | | // MetaApply should imply that `rc` is within range, otherwise there's a |
354 | | // programming bug. |
355 | 71.4k | JXL_ENSURE(rc < input.channel.size()); |
356 | 71.4k | if ((input.channel[c].w < input.channel[rc].w) || |
357 | 71.4k | (input.channel[c].h < input.channel[rc].h)) { |
358 | 0 | return JXL_FAILURE("Corrupted squeeze transform"); |
359 | 0 | } |
360 | 71.4k | if (horizontal) { |
361 | 38.1k | JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); |
362 | 38.1k | } else { |
363 | 33.3k | JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); |
364 | 33.3k | } |
365 | 71.4k | } |
366 | 25.7k | input.channel.erase(input.channel.begin() + offset, |
367 | 25.7k | input.channel.begin() + offset + (endc - beginc + 1)); |
368 | 25.7k | } |
369 | 9.29k | return true; |
370 | 9.29k | } Unexecuted instantiation: jxl::N_SSE4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) jxl::N_AVX2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) Line | Count | Source | 331 | 9.29k | ThreadPool *pool) { | 332 | 35.0k | for (int i = parameters.size() - 1; i >= 0; i--) { | 333 | 25.7k | JXL_RETURN_IF_ERROR( | 334 | 25.7k | CheckMetaSqueezeParams(parameters[i], input.channel.size())); | 335 | 25.7k | bool horizontal = parameters[i].horizontal; | 336 | 25.7k | bool in_place = parameters[i].in_place; | 337 | 25.7k | uint32_t beginc = parameters[i].begin_c; | 338 | 25.7k | uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; | 339 | 25.7k | uint32_t offset; | 340 | 25.7k | if (in_place) { | 341 | 18.7k | offset = endc + 1; | 342 | 18.7k | } else { | 343 | 7.08k | offset = input.channel.size() + beginc - endc - 1; | 344 | 7.08k | } | 345 | 25.7k | if (beginc < input.nb_meta_channels) { | 346 | | // This is checked in MetaSqueeze. | 347 | 0 | JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c); | 348 | 0 | input.nb_meta_channels -= parameters[i].num_c; | 349 | 0 | } | 350 | | | 351 | 97.2k | for (uint32_t c = beginc; c <= endc; c++) { | 352 | 71.4k | uint32_t rc = offset + c - beginc; | 353 | | // MetaApply should imply that `rc` is within range, otherwise there's a | 354 | | // programming bug. | 355 | 71.4k | JXL_ENSURE(rc < input.channel.size()); | 356 | 71.4k | if ((input.channel[c].w < input.channel[rc].w) || | 357 | 71.4k | (input.channel[c].h < input.channel[rc].h)) { | 358 | 0 | return JXL_FAILURE("Corrupted squeeze transform"); | 359 | 0 | } | 360 | 71.4k | if (horizontal) { | 361 | 38.1k | JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); | 362 | 38.1k | } else { | 363 | 33.3k | JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); | 364 | 33.3k | } | 365 | 71.4k | } | 366 | 25.7k | input.channel.erase(input.channel.begin() + offset, | 367 | 25.7k | input.channel.begin() + offset + (endc - beginc + 1)); | 368 | 25.7k | } | 369 | 9.29k | return true; | 370 | 9.29k | } |
Unexecuted instantiation: jxl::N_SSE2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) |
371 | | |
372 | | } // namespace HWY_NAMESPACE |
373 | | } // namespace jxl |
374 | | HWY_AFTER_NAMESPACE(); |
375 | | |
376 | | #if HWY_ONCE |
377 | | |
378 | | namespace jxl { |
379 | | |
380 | | HWY_EXPORT(InvSqueeze); |
381 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
382 | 9.29k | ThreadPool *pool) { |
383 | 9.29k | return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool); |
384 | 9.29k | } |
385 | | |
386 | | void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters, |
387 | 9.55k | const Image &image) { |
388 | 9.55k | int nb_channels = image.channel.size() - image.nb_meta_channels; |
389 | | |
390 | 9.55k | parameters->clear(); |
391 | 9.55k | size_t w = image.channel[image.nb_meta_channels].w; |
392 | 9.55k | size_t h = image.channel[image.nb_meta_channels].h; |
393 | 9.55k | JXL_DEBUG_V( |
394 | 9.55k | 7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h); |
395 | | |
396 | | // do horizontal first on wide images; vertical first on tall images |
397 | 9.55k | bool wide = (w > h); |
398 | | |
399 | 9.55k | if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w && |
400 | 9.55k | image.channel[image.nb_meta_channels + 1].h == h) { |
401 | | // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0 |
402 | | // previews |
403 | 3.53k | JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h); |
404 | 3.53k | SqueezeParams params; |
405 | | // horizontal chroma squeeze |
406 | 3.53k | params.horizontal = true; |
407 | 3.53k | params.in_place = false; |
408 | 3.53k | params.begin_c = image.nb_meta_channels + 1; |
409 | 3.53k | params.num_c = 2; |
410 | 3.53k | parameters->push_back(params); |
411 | 3.53k | params.horizontal = false; |
412 | | // vertical chroma squeeze |
413 | 3.53k | parameters->push_back(params); |
414 | 3.53k | } |
415 | 9.55k | SqueezeParams params; |
416 | 9.55k | params.begin_c = image.nb_meta_channels; |
417 | 9.55k | params.num_c = nb_channels; |
418 | 9.55k | params.in_place = true; |
419 | | |
420 | 9.55k | if (!wide) { |
421 | 3.31k | if (h > kMaxFirstPreviewSize) { |
422 | 787 | params.horizontal = false; |
423 | 787 | parameters->push_back(params); |
424 | 787 | h = (h + 1) / 2; |
425 | 787 | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
426 | 787 | } |
427 | 3.31k | } |
428 | 20.6k | while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) { |
429 | 11.1k | if (w > kMaxFirstPreviewSize) { |
430 | 10.7k | params.horizontal = true; |
431 | 10.7k | parameters->push_back(params); |
432 | 10.7k | w = (w + 1) / 2; |
433 | 10.7k | JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h); |
434 | 10.7k | } |
435 | 11.1k | if (h > kMaxFirstPreviewSize) { |
436 | 8.18k | params.horizontal = false; |
437 | 8.18k | parameters->push_back(params); |
438 | 8.18k | h = (h + 1) / 2; |
439 | 8.18k | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
440 | 8.18k | } |
441 | 11.1k | } |
442 | 9.55k | JXL_DEBUG_V(7, "that's it"); |
443 | 9.55k | } |
444 | | |
445 | | Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, |
446 | 53.7k | int num_channels) { |
447 | 53.7k | int c1 = parameter.begin_c; |
448 | 53.7k | int c2 = parameter.begin_c + parameter.num_c - 1; |
449 | 53.7k | if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) { |
450 | 22 | return JXL_FAILURE("Invalid channel range"); |
451 | 22 | } |
452 | 53.6k | return true; |
453 | 53.7k | } |
454 | | |
455 | 9.74k | Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) { |
456 | 9.74k | JxlMemoryManager *memory_manager = image.memory_manager(); |
457 | 9.74k | if (parameters->empty()) { |
458 | 9.55k | DefaultSqueezeParameters(parameters, image); |
459 | 9.55k | } |
460 | | |
461 | 27.9k | for (auto ¶meter : *parameters) { |
462 | 27.9k | JXL_RETURN_IF_ERROR( |
463 | 27.9k | CheckMetaSqueezeParams(parameter, image.channel.size())); |
464 | 27.8k | bool horizontal = parameter.horizontal; |
465 | 27.8k | bool in_place = parameter.in_place; |
466 | 27.8k | uint32_t beginc = parameter.begin_c; |
467 | 27.8k | uint32_t endc = parameter.begin_c + parameter.num_c - 1; |
468 | | |
469 | 27.8k | uint32_t offset; |
470 | 27.8k | if (beginc < image.nb_meta_channels) { |
471 | 5 | if (endc >= image.nb_meta_channels) { |
472 | 1 | return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels"); |
473 | 1 | } |
474 | 4 | if (!in_place) { |
475 | 1 | return JXL_FAILURE( |
476 | 1 | "Invalid squeeze: meta channels require in-place residuals"); |
477 | 1 | } |
478 | 3 | image.nb_meta_channels += parameter.num_c; |
479 | 3 | } |
480 | 27.8k | if (in_place) { |
481 | 19.8k | offset = endc + 1; |
482 | 19.8k | } else { |
483 | 8.03k | offset = image.channel.size(); |
484 | 8.03k | } |
485 | 104k | for (uint32_t c = beginc; c <= endc; c++) { |
486 | 77.0k | if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) { |
487 | 1 | return JXL_FAILURE("Too many squeezes: shift > 30"); |
488 | 1 | } |
489 | 77.0k | size_t w = image.channel[c].w; |
490 | 77.0k | size_t h = image.channel[c].h; |
491 | 77.0k | if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel"); |
492 | 77.0k | if (horizontal) { |
493 | 40.6k | image.channel[c].w = (w + 1) / 2; |
494 | 40.6k | if (image.channel[c].hshift >= 0) image.channel[c].hshift++; |
495 | 40.6k | w = w - (w + 1) / 2; |
496 | 40.6k | } else { |
497 | 36.4k | image.channel[c].h = (h + 1) / 2; |
498 | 36.4k | if (image.channel[c].vshift >= 0) image.channel[c].vshift++; |
499 | 36.4k | h = h - (h + 1) / 2; |
500 | 36.4k | } |
501 | 77.0k | JXL_RETURN_IF_ERROR(image.channel[c].shrink()); |
502 | 154k | JXL_ASSIGN_OR_RETURN(Channel placeholder, |
503 | 154k | Channel::Create(memory_manager, w, h)); |
504 | 154k | placeholder.hshift = image.channel[c].hshift; |
505 | 154k | placeholder.vshift = image.channel[c].vshift; |
506 | | |
507 | 154k | image.channel.insert(image.channel.begin() + offset + (c - beginc), |
508 | 154k | std::move(placeholder)); |
509 | 154k | JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s", |
510 | 154k | image.DebugString().c_str()); |
511 | 154k | } |
512 | 27.8k | } |
513 | 9.72k | return true; |
514 | 9.74k | } |
515 | | |
516 | | } // namespace jxl |
517 | | |
518 | | #endif |