/src/libjxl/lib/jxl/modular/transform/squeeze.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/modular/transform/squeeze.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cstddef> |
12 | | #include <cstdint> |
13 | | #include <cstdlib> |
14 | | #include <utility> |
15 | | #include <vector> |
16 | | |
17 | | #include "lib/jxl/base/common.h" |
18 | | #include "lib/jxl/base/compiler_specific.h" |
19 | | #include "lib/jxl/base/data_parallel.h" |
20 | | #include "lib/jxl/base/printf_macros.h" |
21 | | #include "lib/jxl/base/status.h" |
22 | | #include "lib/jxl/modular/modular_image.h" |
23 | | #include "lib/jxl/modular/transform/squeeze_params.h" |
24 | | #undef HWY_TARGET_INCLUDE |
25 | | #define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc" |
26 | | #include <hwy/foreach_target.h> |
27 | | #include <hwy/highway.h> |
28 | | |
29 | | #include "lib/jxl/simd_util-inl.h" |
30 | | |
31 | | HWY_BEFORE_NAMESPACE(); |
32 | | namespace jxl { |
33 | | namespace HWY_NAMESPACE { |
34 | | |
35 | | #if HWY_TARGET != HWY_SCALAR |
36 | | |
37 | | // These templates are not found via ADL. |
38 | | using hwy::HWY_NAMESPACE::Abs; |
39 | | using hwy::HWY_NAMESPACE::Add; |
40 | | using hwy::HWY_NAMESPACE::And; |
41 | | using hwy::HWY_NAMESPACE::DupEven; |
42 | | using hwy::HWY_NAMESPACE::DupOdd; |
43 | | using hwy::HWY_NAMESPACE::Gt; |
44 | | using hwy::HWY_NAMESPACE::IfThenElse; |
45 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
46 | | using hwy::HWY_NAMESPACE::Lt; |
47 | | using hwy::HWY_NAMESPACE::MulEven; |
48 | | using hwy::HWY_NAMESPACE::MulOdd; |
49 | | using hwy::HWY_NAMESPACE::Ne; |
50 | | using hwy::HWY_NAMESPACE::Neg; |
51 | | using hwy::HWY_NAMESPACE::OddEven; |
52 | | using hwy::HWY_NAMESPACE::RebindToUnsigned; |
53 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
54 | | using hwy::HWY_NAMESPACE::ShiftRight; |
55 | | using hwy::HWY_NAMESPACE::Sub; |
56 | | using hwy::HWY_NAMESPACE::Xor; |
57 | | |
58 | | using D = HWY_CAPPED(pixel_type, 8); |
59 | | using DU = RebindToUnsigned<D>; |
60 | | constexpr D d; |
61 | | constexpr DU du; |
62 | | |
63 | | JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual, |
64 | | const pixel_type *JXL_RESTRICT p_avg, |
65 | | const pixel_type *JXL_RESTRICT p_navg, |
66 | | const pixel_type *p_pout, |
67 | | pixel_type *JXL_RESTRICT p_out, |
68 | 37.3M | pixel_type *p_nout) { |
69 | 37.3M | const size_t N = Lanes(d); |
70 | 37.3M | auto onethird = Set(d, 0x55555556); |
71 | 74.6M | for (size_t x = 0; x < 8; x += N) { |
72 | 37.3M | auto avg = Load(d, p_avg + x); |
73 | 37.3M | auto next_avg = Load(d, p_navg + x); |
74 | 37.3M | auto top = Load(d, p_pout + x); |
75 | | // Equivalent to SmoothTendency(top,avg,next_avg), but without branches |
76 | | // typo:off |
77 | 37.3M | auto Ba = Sub(top, avg); |
78 | 37.3M | auto an = Sub(avg, next_avg); |
79 | 37.3M | auto nonmono = Xor(Ba, an); |
80 | 37.3M | auto absBa = Abs(Ba); |
81 | 37.3M | auto absan = Abs(an); |
82 | 37.3M | auto absBn = Abs(Sub(top, next_avg)); |
83 | | // Compute a3 = absBa / 3 |
84 | 37.3M | auto a3eh = MulEven(absBa, onethird); |
85 | 37.3M | auto a3oh = MulOdd(absBa, onethird); |
86 | | |
87 | 37.3M | #if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2)) |
88 | 37.3M | #if HWY_IS_LITTLE_ENDIAN |
89 | 37.3M | auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
90 | | #else // not little endian |
91 | | auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
92 | | #endif // endianness |
93 | | #else // hwy < 1.2 |
94 | | #if HWY_IS_LITTLE_ENDIAN |
95 | | auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh))); |
96 | | #else // not little endian |
97 | | auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh)) |
98 | | #endif // endianness |
99 | | #endif // hwy version |
100 | | |
101 | 37.3M | a3 = Add(a3, Add(absBn, Set(d, 2))); |
102 | 37.3M | auto absdiff = ShiftRight<2>(a3); |
103 | 37.3M | auto skipdiff = Ne(Ba, Zero(d)); |
104 | 37.3M | skipdiff = And(skipdiff, Ne(an, Zero(d))); |
105 | 37.3M | skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); |
106 | 37.3M | auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); |
107 | 37.3M | absdiff = IfThenElse(Gt(absdiff, absBa2), |
108 | 37.3M | Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); |
109 | | // typo:on |
110 | 37.3M | auto absan2 = ShiftLeft<1>(absan); |
111 | 37.3M | absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), |
112 | 37.3M | absan2, absdiff); |
113 | 37.3M | auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); |
114 | 37.3M | auto tendency = IfThenZeroElse(skipdiff, diff1); |
115 | | |
116 | 37.3M | auto diff_minus_tendency = Load(d, p_residual + x); |
117 | 37.3M | auto diff = Add(diff_minus_tendency, tendency); |
118 | 37.3M | auto out = |
119 | 37.3M | Add(avg, ShiftRight<1>( |
120 | 37.3M | Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); |
121 | 37.3M | Store(out, d, p_out + x); |
122 | 37.3M | Store(Sub(out, diff), d, p_nout + x); |
123 | 37.3M | } |
124 | 37.3M | } Unexecuted instantiation: jxl::N_SSE4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) jxl::N_AVX2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) Line | Count | Source | 68 | 37.3M | pixel_type *p_nout) { | 69 | 37.3M | const size_t N = Lanes(d); | 70 | 37.3M | auto onethird = Set(d, 0x55555556); | 71 | 74.6M | for (size_t x = 0; x < 8; x += N) { | 72 | 37.3M | auto avg = Load(d, p_avg + x); | 73 | 37.3M | auto next_avg = Load(d, p_navg + x); | 74 | 37.3M | auto top = Load(d, p_pout + x); | 75 | | // Equivalent to SmoothTendency(top,avg,next_avg), but without branches | 76 | | // typo:off | 77 | 37.3M | auto Ba = Sub(top, avg); | 78 | 37.3M | auto an = Sub(avg, next_avg); | 79 | 37.3M | auto nonmono = Xor(Ba, an); | 80 | 37.3M | auto absBa = Abs(Ba); | 81 | 37.3M | auto absan = Abs(an); | 82 | 37.3M | auto absBn = Abs(Sub(top, next_avg)); | 83 | | // Compute a3 = absBa / 3 | 84 | 37.3M | auto a3eh = MulEven(absBa, onethird); | 85 | 37.3M | auto a3oh = MulOdd(absBa, onethird); | 86 | | | 87 | 37.3M | #if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2)) | 88 | 37.3M | #if HWY_IS_LITTLE_ENDIAN | 89 | 37.3M | auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh)); | 90 | | #else // not little endian | 91 | | auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh)); | 92 | | #endif // endianness | 93 | | #else // hwy < 1.2 | 94 | | #if HWY_IS_LITTLE_ENDIAN | 95 | | auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh))); | 96 | | #else // not little endian | 97 | | auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh)) | 98 | | #endif // endianness | 99 | | #endif // hwy version | 100 | | | 101 | 37.3M | a3 = Add(a3, Add(absBn, Set(d, 2))); | 102 | 37.3M | auto absdiff = ShiftRight<2>(a3); | 103 | 37.3M | auto skipdiff = Ne(Ba, Zero(d)); | 104 | 37.3M | skipdiff = And(skipdiff, Ne(an, Zero(d))); | 105 | 37.3M | skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); | 106 | 37.3M | auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); | 107 | 37.3M | absdiff = IfThenElse(Gt(absdiff, absBa2), | 108 | 37.3M | Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); | 109 | | // typo:on | 110 | 37.3M | auto absan2 = ShiftLeft<1>(absan); | 111 | 37.3M | absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), | 112 | 37.3M | absan2, absdiff); | 113 | 37.3M | auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); | 114 | 37.3M | auto tendency = IfThenZeroElse(skipdiff, diff1); | 115 | | | 116 | 37.3M | auto diff_minus_tendency = Load(d, p_residual + x); | 117 | 37.3M | auto diff = Add(diff_minus_tendency, tendency); | 118 | 37.3M | auto out = | 119 | 37.3M | Add(avg, ShiftRight<1>( | 120 | 37.3M | Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); | 121 | 37.3M | Store(out, d, p_out + x); | 122 | 37.3M | Store(Sub(out, diff), d, p_nout + x); | 123 | 37.3M | } | 124 | 37.3M | } |
Unexecuted instantiation: jxl::N_AVX3::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) Unexecuted instantiation: jxl::N_AVX3_SPR::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) Unexecuted instantiation: jxl::N_SSE2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*) |
125 | | |
126 | | #endif // HWY_TARGET != HWY_SCALAR |
127 | | |
128 | 161k | Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
129 | 161k | JXL_ENSURE(c < input.channel.size()); |
130 | 161k | JXL_ENSURE(rc < input.channel.size()); |
131 | 161k | Channel &chin = input.channel[c]; |
132 | 161k | const Channel &chin_residual = input.channel[rc]; |
133 | | // These must be valid since we ran MetaApply already. |
134 | 161k | JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2)); |
135 | 161k | JXL_ENSURE(chin.h == chin_residual.h); |
136 | 161k | JxlMemoryManager *memory_manager = input.memory_manager(); |
137 | | |
138 | 161k | if (chin_residual.w == 0) { |
139 | | // Short-circuit: output channel has same dimensions as input. |
140 | 6.82k | input.channel[c].hshift--; |
141 | 6.82k | return true; |
142 | 6.82k | } |
143 | | |
144 | | // Note: chin.w >= chin_residual.w and at most 1 different. |
145 | 310k | JXL_ASSIGN_OR_RETURN(Channel chout, |
146 | 310k | Channel::Create(memory_manager, chin.w + chin_residual.w, |
147 | 310k | chin.h, chin.hshift - 1, chin.vshift)); |
148 | 310k | JXL_DEBUG_V(4, |
149 | 310k | "Undoing horizontal squeeze of channel %i using residuals in " |
150 | 310k | "channel %i (going from width %" PRIuS " to %" PRIuS ")", |
151 | 310k | c, rc, chin.w, chout.w); |
152 | | |
153 | 310k | if (chin_residual.h == 0) { |
154 | | // Short-circuit: channel with no pixels. |
155 | 0 | input.channel[c] = std::move(chout); |
156 | 0 | return true; |
157 | 0 | } |
158 | 5.50M | auto unsqueeze_row = [&](size_t y, size_t x0) { |
159 | 5.50M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); |
160 | 5.50M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); |
161 | 5.50M | pixel_type *JXL_RESTRICT p_out = chout.Row(y); |
162 | 46.4M | for (size_t x = x0; x < chin_residual.w; x++) { |
163 | 40.9M | pixel_type_w diff_minus_tendency = p_residual[x]; |
164 | 40.9M | pixel_type_w avg = p_avg[x]; |
165 | 40.9M | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); |
166 | 40.9M | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); |
167 | 40.9M | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); |
168 | 40.9M | pixel_type_w diff = diff_minus_tendency + tendency; |
169 | 40.9M | pixel_type_w A = avg + (diff / 2); |
170 | 40.9M | p_out[(x << 1)] = A; |
171 | 40.9M | pixel_type_w B = A - diff; |
172 | 40.9M | p_out[(x << 1) + 1] = B; |
173 | 40.9M | } |
174 | 5.50M | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; |
175 | 5.50M | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const Line | Count | Source | 158 | 5.50M | auto unsqueeze_row = [&](size_t y, size_t x0) { | 159 | 5.50M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); | 160 | 5.50M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); | 161 | 5.50M | pixel_type *JXL_RESTRICT p_out = chout.Row(y); | 162 | 46.4M | for (size_t x = x0; x < chin_residual.w; x++) { | 163 | 40.9M | pixel_type_w diff_minus_tendency = p_residual[x]; | 164 | 40.9M | pixel_type_w avg = p_avg[x]; | 165 | 40.9M | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); | 166 | 40.9M | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); | 167 | 40.9M | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); | 168 | 40.9M | pixel_type_w diff = diff_minus_tendency + tendency; | 169 | 40.9M | pixel_type_w A = avg + (diff / 2); | 170 | 40.9M | p_out[(x << 1)] = A; | 171 | 40.9M | pixel_type_w B = A - diff; | 172 | 40.9M | p_out[(x << 1) + 1] = B; | 173 | 40.9M | } | 174 | 5.50M | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; | 175 | 5.50M | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const |
176 | | |
177 | | // somewhat complicated trickery just to be able to SIMD this. |
178 | | // Horizontal unsqueeze has horizontal data dependencies, so we do |
179 | | // 8 rows at a time and treat it as a vertical unsqueeze of a |
180 | | // transposed 8x8 block (or 9x8 for one input). |
181 | 155k | static constexpr const size_t kRowsPerThread = 8; |
182 | 155k | const auto unsqueeze_span = [&](const uint32_t task, |
183 | 740k | size_t /* thread */) -> Status { |
184 | 740k | const size_t y0 = task * kRowsPerThread; |
185 | 740k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); |
186 | 740k | size_t x = 0; |
187 | | |
188 | 740k | #if HWY_TARGET != HWY_SCALAR |
189 | 740k | ptrdiff_t onerow_in = chin.plane.PixelsPerRow(); |
190 | 740k | ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow(); |
191 | 740k | ptrdiff_t onerow_out = chout.plane.PixelsPerRow(); |
192 | 740k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); |
193 | 740k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); |
194 | 740k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); |
195 | 740k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; |
196 | 740k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; |
197 | 740k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; |
198 | 740k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; |
199 | 740k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; |
200 | 740k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; |
201 | 740k | const size_t N = Lanes(d); |
202 | 740k | if (chin_residual.w > 16 && rows == kRowsPerThread) { |
203 | 2.73M | for (; x < chin_residual.w - 9; x += 8) { |
204 | 2.31M | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); |
205 | 2.31M | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); |
206 | 20.8M | for (size_t y = 0; y < kRowsPerThread; y++) { |
207 | 18.4M | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; |
208 | 18.4M | } |
209 | 20.8M | for (size_t i = 0; i < 8; i++) { |
210 | 18.4M | FastUnsqueeze( |
211 | 18.4M | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), |
212 | 18.4M | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), |
213 | 18.4M | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); |
214 | 18.4M | } |
215 | | |
216 | 2.31M | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); |
217 | 2.31M | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); |
218 | 20.8M | for (size_t y = 0; y < kRowsPerThread; y++) { |
219 | 36.9M | for (size_t i = 0; i < kRowsPerThread; i += N) { |
220 | 18.4M | auto even = Load(d, b_p_out_evenT + 8 * y + i); |
221 | 18.4M | auto odd = Load(d, b_p_out_oddT + 8 * y + i); |
222 | 18.4M | StoreInterleaved(d, even, odd, |
223 | 18.4M | p_out + ((x + i) << 1) + onerow_out * y); |
224 | 18.4M | } |
225 | 18.4M | } |
226 | 2.31M | } |
227 | 424k | } |
228 | 740k | #endif // HWY_TARGET != HWY_SCALAR |
229 | 6.24M | for (size_t y = 0; y < rows; y++) { |
230 | 5.50M | unsqueeze_row(y0 + y, x); |
231 | 5.50M | } |
232 | 740k | return true; |
233 | 740k | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const Line | Count | Source | 183 | 740k | size_t /* thread */) -> Status { | 184 | 740k | const size_t y0 = task * kRowsPerThread; | 185 | 740k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); | 186 | 740k | size_t x = 0; | 187 | | | 188 | 740k | #if HWY_TARGET != HWY_SCALAR | 189 | 740k | ptrdiff_t onerow_in = chin.plane.PixelsPerRow(); | 190 | 740k | ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow(); | 191 | 740k | ptrdiff_t onerow_out = chout.plane.PixelsPerRow(); | 192 | 740k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); | 193 | 740k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); | 194 | 740k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); | 195 | 740k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; | 196 | 740k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; | 197 | 740k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; | 198 | 740k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; | 199 | 740k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; | 200 | 740k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; | 201 | 740k | const size_t N = Lanes(d); | 202 | 740k | if (chin_residual.w > 16 && rows == kRowsPerThread) { | 203 | 2.73M | for (; x < chin_residual.w - 9; x += 8) { | 204 | 2.31M | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); | 205 | 2.31M | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); | 206 | 20.8M | for (size_t y = 0; y < kRowsPerThread; y++) { | 207 | 18.4M | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; | 208 | 18.4M | } | 209 | 20.8M | for (size_t i = 0; i < 8; i++) { | 210 | 18.4M | FastUnsqueeze( | 211 | 18.4M | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), | 212 | 18.4M | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), | 213 | 18.4M | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); | 214 | 18.4M | } | 215 | | | 216 | 2.31M | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); | 217 | 2.31M | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); | 218 | 20.8M | for (size_t y = 0; y < kRowsPerThread; y++) { | 219 | 36.9M | for (size_t i = 0; i < kRowsPerThread; i += N) { | 220 | 18.4M | auto even = Load(d, b_p_out_evenT + 8 * y + i); | 221 | 18.4M | auto odd = Load(d, b_p_out_oddT + 8 * y + i); | 222 | 18.4M | StoreInterleaved(d, even, odd, | 223 | 18.4M | p_out + ((x + i) << 1) + onerow_out * y); | 224 | 18.4M | } | 225 | 18.4M | } | 226 | 2.31M | } | 227 | 424k | } | 228 | 740k | #endif // HWY_TARGET != HWY_SCALAR | 229 | 6.24M | for (size_t y = 0; y < rows; y++) { | 230 | 5.50M | unsqueeze_row(y0 + y, x); | 231 | 5.50M | } | 232 | 740k | return true; | 233 | 740k | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const |
234 | 155k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), |
235 | 155k | ThreadPool::NoInit, unsqueeze_span, |
236 | 155k | "InvHorizontalSqueeze")); |
237 | 155k | input.channel[c] = std::move(chout); |
238 | 155k | return true; |
239 | 155k | } Unexecuted instantiation: jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Line | Count | Source | 128 | 161k | Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { | 129 | 161k | JXL_ENSURE(c < input.channel.size()); | 130 | 161k | JXL_ENSURE(rc < input.channel.size()); | 131 | 161k | Channel &chin = input.channel[c]; | 132 | 161k | const Channel &chin_residual = input.channel[rc]; | 133 | | // These must be valid since we ran MetaApply already. | 134 | 161k | JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2)); | 135 | 161k | JXL_ENSURE(chin.h == chin_residual.h); | 136 | 161k | JxlMemoryManager *memory_manager = input.memory_manager(); | 137 | | | 138 | 161k | if (chin_residual.w == 0) { | 139 | | // Short-circuit: output channel has same dimensions as input. | 140 | 6.82k | input.channel[c].hshift--; | 141 | 6.82k | return true; | 142 | 6.82k | } | 143 | | | 144 | | // Note: chin.w >= chin_residual.w and at most 1 different. | 145 | 310k | JXL_ASSIGN_OR_RETURN(Channel chout, | 146 | 310k | Channel::Create(memory_manager, chin.w + chin_residual.w, | 147 | 310k | chin.h, chin.hshift - 1, chin.vshift)); | 148 | 310k | JXL_DEBUG_V(4, | 149 | 310k | "Undoing horizontal squeeze of channel %i using residuals in " | 150 | 310k | "channel %i (going from width %" PRIuS " to %" PRIuS ")", | 151 | 310k | c, rc, chin.w, chout.w); | 152 | | | 153 | 310k | if (chin_residual.h == 0) { | 154 | | // Short-circuit: channel with no pixels. | 155 | 0 | input.channel[c] = std::move(chout); | 156 | 0 | return true; | 157 | 0 | } | 158 | 155k | auto unsqueeze_row = [&](size_t y, size_t x0) { | 159 | 155k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); | 160 | 155k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); | 161 | 155k | pixel_type *JXL_RESTRICT p_out = chout.Row(y); | 162 | 155k | for (size_t x = x0; x < chin_residual.w; x++) { | 163 | 155k | pixel_type_w diff_minus_tendency = p_residual[x]; | 164 | 155k | pixel_type_w avg = p_avg[x]; | 165 | 155k | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); | 166 | 155k | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); | 167 | 155k | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); | 168 | 155k | pixel_type_w diff = diff_minus_tendency + tendency; | 169 | 155k | pixel_type_w A = avg + (diff / 2); | 170 | 155k | p_out[(x << 1)] = A; | 171 | 155k | pixel_type_w B = A - diff; | 172 | 155k | p_out[(x << 1) + 1] = B; | 173 | 155k | } | 174 | 155k | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; | 175 | 155k | }; | 176 | | | 177 | | // somewhat complicated trickery just to be able to SIMD this. | 178 | | // Horizontal unsqueeze has horizontal data dependencies, so we do | 179 | | // 8 rows at a time and treat it as a vertical unsqueeze of a | 180 | | // transposed 8x8 block (or 9x8 for one input). | 181 | 155k | static constexpr const size_t kRowsPerThread = 8; | 182 | 155k | const auto unsqueeze_span = [&](const uint32_t task, | 183 | 155k | size_t /* thread */) -> Status { | 184 | 155k | const size_t y0 = task * kRowsPerThread; | 185 | 155k | const size_t rows = std::min(kRowsPerThread, chin.h - y0); | 186 | 155k | size_t x = 0; | 187 | | | 188 | 155k | #if HWY_TARGET != HWY_SCALAR | 189 | 155k | ptrdiff_t onerow_in = chin.plane.PixelsPerRow(); | 190 | 155k | ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow(); | 191 | 155k | ptrdiff_t onerow_out = chout.plane.PixelsPerRow(); | 192 | 155k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); | 193 | 155k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); | 194 | 155k | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); | 195 | 155k | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; | 196 | 155k | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; | 197 | 155k | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; | 198 | 155k | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; | 199 | 155k | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; | 200 | 155k | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; | 201 | 155k | const size_t N = Lanes(d); | 202 | 155k | if (chin_residual.w > 16 && rows == kRowsPerThread) { | 203 | 155k | for (; x < chin_residual.w - 9; x += 8) { | 204 | 155k | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); | 205 | 155k | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); | 206 | 155k | for (size_t y = 0; y < kRowsPerThread; y++) { | 207 | 155k | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; | 208 | 155k | } | 209 | 155k | for (size_t i = 0; i < 8; i++) { | 210 | 155k | FastUnsqueeze( | 211 | 155k | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), | 212 | 155k | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), | 213 | 155k | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); | 214 | 155k | } | 215 | | | 216 | 155k | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); | 217 | 155k | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); | 218 | 155k | for (size_t y = 0; y < kRowsPerThread; y++) { | 219 | 155k | for (size_t i = 0; i < kRowsPerThread; i += N) { | 220 | 155k | auto even = Load(d, b_p_out_evenT + 8 * y + i); | 221 | 155k | auto odd = Load(d, b_p_out_oddT + 8 * y + i); | 222 | 155k | StoreInterleaved(d, even, odd, | 223 | 155k | p_out + ((x + i) << 1) + onerow_out * y); | 224 | 155k | } | 225 | 155k | } | 226 | 155k | } | 227 | 155k | } | 228 | 155k | #endif // HWY_TARGET != HWY_SCALAR | 229 | 155k | for (size_t y = 0; y < rows; y++) { | 230 | 155k | unsqueeze_row(y0 + y, x); | 231 | 155k | } | 232 | 155k | return true; | 233 | 155k | }; | 234 | 155k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), | 235 | 155k | ThreadPool::NoInit, unsqueeze_span, | 236 | 155k | "InvHorizontalSqueeze")); | 237 | 155k | input.channel[c] = std::move(chout); | 238 | 155k | return true; | 239 | 155k | } |
Unexecuted instantiation: jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) |
240 | | |
241 | 206k | Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
242 | 206k | JXL_ENSURE(c < input.channel.size()); |
243 | 206k | JXL_ENSURE(rc < input.channel.size()); |
244 | 206k | const Channel &chin = input.channel[c]; |
245 | 206k | const Channel &chin_residual = input.channel[rc]; |
246 | | // These must be valid since we ran MetaApply already. |
247 | 206k | JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2)); |
248 | 206k | JXL_ENSURE(chin.w == chin_residual.w); |
249 | 206k | JxlMemoryManager *memory_manager = input.memory_manager(); |
250 | | |
251 | 206k | if (chin_residual.h == 0) { |
252 | | // Short-circuit: output channel has same dimensions as input. |
253 | 25.4k | input.channel[c].vshift--; |
254 | 25.4k | return true; |
255 | 25.4k | } |
256 | | |
257 | | // Note: chin.h >= chin_residual.h and at most 1 different. |
258 | 362k | JXL_ASSIGN_OR_RETURN( |
259 | 362k | Channel chout, |
260 | 362k | Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h, |
261 | 362k | chin.hshift, chin.vshift - 1)); |
262 | 362k | JXL_DEBUG_V( |
263 | 362k | 4, |
264 | 362k | "Undoing vertical squeeze of channel %i using residuals in channel " |
265 | 362k | "%i (going from height %" PRIuS " to %" PRIuS ")", |
266 | 362k | c, rc, chin.h, chout.h); |
267 | | |
268 | 362k | if (chin_residual.w == 0) { |
269 | | // Short-circuit: channel with no pixels. |
270 | 0 | input.channel[c] = std::move(chout); |
271 | 0 | return true; |
272 | 0 | } |
273 | | |
274 | 181k | static constexpr const int kColsPerThread = 64; |
275 | 181k | const auto unsqueeze_slice = [&](const uint32_t task, |
276 | 192k | size_t /* thread */) -> Status { |
277 | 192k | const size_t x0 = task * kColsPerThread; |
278 | 192k | const size_t x1 = |
279 | 192k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); |
280 | 192k | const size_t w = x1 - x0; |
281 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is |
282 | | // always chin_residual.h. |
283 | 4.91M | for (size_t y = 0; y < chin_residual.h; y++) { |
284 | 4.72M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; |
285 | 4.72M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; |
286 | 4.72M | const pixel_type *JXL_RESTRICT p_navg = |
287 | 4.72M | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; |
288 | 4.72M | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; |
289 | 4.72M | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; |
290 | 4.72M | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; |
291 | 4.72M | size_t x = 0; |
292 | 4.72M | #if HWY_TARGET != HWY_SCALAR |
293 | 23.5M | for (; x + 7 < w; x += 8) { |
294 | 18.8M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, |
295 | 18.8M | p_out + x, p_nout + x); |
296 | 18.8M | } |
297 | 4.72M | #endif |
298 | 16.1M | for (; x < w; x++) { |
299 | 11.4M | pixel_type_w avg = p_avg[x]; |
300 | 11.4M | pixel_type_w next_avg = p_navg[x]; |
301 | 11.4M | pixel_type_w top = p_pout[x]; |
302 | 11.4M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); |
303 | 11.4M | pixel_type_w diff_minus_tendency = p_residual[x]; |
304 | 11.4M | pixel_type_w diff = diff_minus_tendency + tendency; |
305 | 11.4M | pixel_type_w out = avg + (diff / 2); |
306 | 11.4M | p_out[x] = out; |
307 | | // If the chin_residual.h == chin.h, the output has an even number |
308 | | // of rows so the next line is fine. Otherwise, this loop won't |
309 | | // write to the last output row which is handled separately. |
310 | 11.4M | p_nout[x] = out - diff; |
311 | 11.4M | } |
312 | 4.72M | } |
313 | 192k | return true; |
314 | 192k | }; Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const squeeze.cc:jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 276 | 192k | size_t /* thread */) -> Status { | 277 | 192k | const size_t x0 = task * kColsPerThread; | 278 | 192k | const size_t x1 = | 279 | 192k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 280 | 192k | const size_t w = x1 - x0; | 281 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 282 | | // always chin_residual.h. | 283 | 4.91M | for (size_t y = 0; y < chin_residual.h; y++) { | 284 | 4.72M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 285 | 4.72M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 286 | 4.72M | const pixel_type *JXL_RESTRICT p_navg = | 287 | 4.72M | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 288 | 4.72M | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 289 | 4.72M | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 290 | 4.72M | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 291 | 4.72M | size_t x = 0; | 292 | 4.72M | #if HWY_TARGET != HWY_SCALAR | 293 | 23.5M | for (; x + 7 < w; x += 8) { | 294 | 18.8M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 295 | 18.8M | p_out + x, p_nout + x); | 296 | 18.8M | } | 297 | 4.72M | #endif | 298 | 16.1M | for (; x < w; x++) { | 299 | 11.4M | pixel_type_w avg = p_avg[x]; | 300 | 11.4M | pixel_type_w next_avg = p_navg[x]; | 301 | 11.4M | pixel_type_w top = p_pout[x]; | 302 | 11.4M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 303 | 11.4M | pixel_type_w diff_minus_tendency = p_residual[x]; | 304 | 11.4M | pixel_type_w diff = diff_minus_tendency + tendency; | 305 | 11.4M | pixel_type_w out = avg + (diff / 2); | 306 | 11.4M | p_out[x] = out; | 307 | | // If the chin_residual.h == chin.h, the output has an even number | 308 | | // of rows so the next line is fine. Otherwise, this loop won't | 309 | | // write to the last output row which is handled separately. | 310 | 11.4M | p_nout[x] = out - diff; | 311 | 11.4M | } | 312 | 4.72M | } | 313 | 192k | return true; | 314 | 192k | }; |
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const |
315 | 181k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), |
316 | 181k | ThreadPool::NoInit, unsqueeze_slice, |
317 | 181k | "InvVertSqueeze")); |
318 | | |
319 | 181k | if (chout.h & 1) { |
320 | 57.8k | size_t y = chin.h - 1; |
321 | 57.8k | const pixel_type *p_avg = chin.Row(y); |
322 | 57.8k | pixel_type *p_out = chout.Row(y << 1); |
323 | 1.53M | for (size_t x = 0; x < chin.w; x++) { |
324 | 1.47M | p_out[x] = p_avg[x]; |
325 | 1.47M | } |
326 | 57.8k | } |
327 | 181k | input.channel[c] = std::move(chout); |
328 | 181k | return true; |
329 | 181k | } Unexecuted instantiation: jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Line | Count | Source | 241 | 206k | Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { | 242 | 206k | JXL_ENSURE(c < input.channel.size()); | 243 | 206k | JXL_ENSURE(rc < input.channel.size()); | 244 | 206k | const Channel &chin = input.channel[c]; | 245 | 206k | const Channel &chin_residual = input.channel[rc]; | 246 | | // These must be valid since we ran MetaApply already. | 247 | 206k | JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2)); | 248 | 206k | JXL_ENSURE(chin.w == chin_residual.w); | 249 | 206k | JxlMemoryManager *memory_manager = input.memory_manager(); | 250 | | | 251 | 206k | if (chin_residual.h == 0) { | 252 | | // Short-circuit: output channel has same dimensions as input. | 253 | 25.4k | input.channel[c].vshift--; | 254 | 25.4k | return true; | 255 | 25.4k | } | 256 | | | 257 | | // Note: chin.h >= chin_residual.h and at most 1 different. | 258 | 362k | JXL_ASSIGN_OR_RETURN( | 259 | 362k | Channel chout, | 260 | 362k | Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h, | 261 | 362k | chin.hshift, chin.vshift - 1)); | 262 | 362k | JXL_DEBUG_V( | 263 | 362k | 4, | 264 | 362k | "Undoing vertical squeeze of channel %i using residuals in channel " | 265 | 362k | "%i (going from height %" PRIuS " to %" PRIuS ")", | 266 | 362k | c, rc, chin.h, chout.h); | 267 | | | 268 | 362k | if (chin_residual.w == 0) { | 269 | | // Short-circuit: channel with no pixels. | 270 | 0 | input.channel[c] = std::move(chout); | 271 | 0 | return true; | 272 | 0 | } | 273 | | | 274 | 181k | static constexpr const int kColsPerThread = 64; | 275 | 181k | const auto unsqueeze_slice = [&](const uint32_t task, | 276 | 181k | size_t /* thread */) -> Status { | 277 | 181k | const size_t x0 = task * kColsPerThread; | 278 | 181k | const size_t x1 = | 279 | 181k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 280 | 181k | const size_t w = x1 - x0; | 281 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 282 | | // always chin_residual.h. | 283 | 181k | for (size_t y = 0; y < chin_residual.h; y++) { | 284 | 181k | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 285 | 181k | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 286 | 181k | const pixel_type *JXL_RESTRICT p_navg = | 287 | 181k | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 288 | 181k | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 289 | 181k | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 290 | 181k | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 291 | 181k | size_t x = 0; | 292 | 181k | #if HWY_TARGET != HWY_SCALAR | 293 | 181k | for (; x + 7 < w; x += 8) { | 294 | 181k | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 295 | 181k | p_out + x, p_nout + x); | 296 | 181k | } | 297 | 181k | #endif | 298 | 181k | for (; x < w; x++) { | 299 | 181k | pixel_type_w avg = p_avg[x]; | 300 | 181k | pixel_type_w next_avg = p_navg[x]; | 301 | 181k | pixel_type_w top = p_pout[x]; | 302 | 181k | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 303 | 181k | pixel_type_w diff_minus_tendency = p_residual[x]; | 304 | 181k | pixel_type_w diff = diff_minus_tendency + tendency; | 305 | 181k | pixel_type_w out = avg + (diff / 2); | 306 | 181k | p_out[x] = out; | 307 | | // If the chin_residual.h == chin.h, the output has an even number | 308 | | // of rows so the next line is fine. Otherwise, this loop won't | 309 | | // write to the last output row which is handled separately. | 310 | 181k | p_nout[x] = out - diff; | 311 | 181k | } | 312 | 181k | } | 313 | 181k | return true; | 314 | 181k | }; | 315 | 181k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), | 316 | 181k | ThreadPool::NoInit, unsqueeze_slice, | 317 | 181k | "InvVertSqueeze")); | 318 | | | 319 | 181k | if (chout.h & 1) { | 320 | 57.8k | size_t y = chin.h - 1; | 321 | 57.8k | const pixel_type *p_avg = chin.Row(y); | 322 | 57.8k | pixel_type *p_out = chout.Row(y << 1); | 323 | 1.53M | for (size_t x = 0; x < chin.w; x++) { | 324 | 1.47M | p_out[x] = p_avg[x]; | 325 | 1.47M | } | 326 | 57.8k | } | 327 | 181k | input.channel[c] = std::move(chout); | 328 | 181k | return true; | 329 | 181k | } |
Unexecuted instantiation: jxl::N_AVX3::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_SPR::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*) |
330 | | |
331 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
332 | 22.2k | ThreadPool *pool) { |
333 | 152k | for (int i = parameters.size() - 1; i >= 0; i--) { |
334 | 130k | JXL_RETURN_IF_ERROR( |
335 | 130k | CheckMetaSqueezeParams(parameters[i], input.channel.size())); |
336 | 130k | bool horizontal = parameters[i].horizontal; |
337 | 130k | bool in_place = parameters[i].in_place; |
338 | 130k | uint32_t beginc = parameters[i].begin_c; |
339 | 130k | uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; |
340 | 130k | uint32_t offset; |
341 | 130k | if (in_place) { |
342 | 71.8k | offset = endc + 1; |
343 | 71.8k | } else { |
344 | 58.2k | offset = input.channel.size() + beginc - endc - 1; |
345 | 58.2k | } |
346 | 130k | if (beginc < input.nb_meta_channels) { |
347 | | // This is checked in MetaSqueeze. |
348 | 4 | JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c); |
349 | 4 | input.nb_meta_channels -= parameters[i].num_c; |
350 | 4 | } |
351 | | |
352 | 498k | for (uint32_t c = beginc; c <= endc; c++) { |
353 | 368k | uint32_t rc = offset + c - beginc; |
354 | | // MetaApply should imply that `rc` is within range, otherwise there's a |
355 | | // programming bug. |
356 | 368k | JXL_ENSURE(rc < input.channel.size()); |
357 | 368k | if ((input.channel[c].w < input.channel[rc].w) || |
358 | 368k | (input.channel[c].h < input.channel[rc].h)) { |
359 | 0 | return JXL_FAILURE("Corrupted squeeze transform"); |
360 | 0 | } |
361 | 368k | if (horizontal) { |
362 | 161k | JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); |
363 | 206k | } else { |
364 | 206k | JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); |
365 | 206k | } |
366 | 368k | } |
367 | 130k | input.channel.erase(input.channel.begin() + offset, |
368 | 130k | input.channel.begin() + offset + (endc - beginc + 1)); |
369 | 130k | } |
370 | 22.2k | return true; |
371 | 22.2k | } Unexecuted instantiation: jxl::N_SSE4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) jxl::N_AVX2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) Line | Count | Source | 332 | 22.2k | ThreadPool *pool) { | 333 | 152k | for (int i = parameters.size() - 1; i >= 0; i--) { | 334 | 130k | JXL_RETURN_IF_ERROR( | 335 | 130k | CheckMetaSqueezeParams(parameters[i], input.channel.size())); | 336 | 130k | bool horizontal = parameters[i].horizontal; | 337 | 130k | bool in_place = parameters[i].in_place; | 338 | 130k | uint32_t beginc = parameters[i].begin_c; | 339 | 130k | uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; | 340 | 130k | uint32_t offset; | 341 | 130k | if (in_place) { | 342 | 71.8k | offset = endc + 1; | 343 | 71.8k | } else { | 344 | 58.2k | offset = input.channel.size() + beginc - endc - 1; | 345 | 58.2k | } | 346 | 130k | if (beginc < input.nb_meta_channels) { | 347 | | // This is checked in MetaSqueeze. | 348 | 4 | JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c); | 349 | 4 | input.nb_meta_channels -= parameters[i].num_c; | 350 | 4 | } | 351 | | | 352 | 498k | for (uint32_t c = beginc; c <= endc; c++) { | 353 | 368k | uint32_t rc = offset + c - beginc; | 354 | | // MetaApply should imply that `rc` is within range, otherwise there's a | 355 | | // programming bug. | 356 | 368k | JXL_ENSURE(rc < input.channel.size()); | 357 | 368k | if ((input.channel[c].w < input.channel[rc].w) || | 358 | 368k | (input.channel[c].h < input.channel[rc].h)) { | 359 | 0 | return JXL_FAILURE("Corrupted squeeze transform"); | 360 | 0 | } | 361 | 368k | if (horizontal) { | 362 | 161k | JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); | 363 | 206k | } else { | 364 | 206k | JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); | 365 | 206k | } | 366 | 368k | } | 367 | 130k | input.channel.erase(input.channel.begin() + offset, | 368 | 130k | input.channel.begin() + offset + (endc - beginc + 1)); | 369 | 130k | } | 370 | 22.2k | return true; | 371 | 22.2k | } |
Unexecuted instantiation: jxl::N_AVX3::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX3_SPR::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_SSE2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*) |
372 | | |
373 | | } // namespace HWY_NAMESPACE |
374 | | } // namespace jxl |
375 | | HWY_AFTER_NAMESPACE(); |
376 | | |
377 | | #if HWY_ONCE |
378 | | |
379 | | namespace jxl { |
380 | | |
381 | | HWY_EXPORT(InvSqueeze); |
382 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
383 | 22.2k | ThreadPool *pool) { |
384 | 22.2k | return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool); |
385 | 22.2k | } |
386 | | |
387 | | void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters, |
388 | 19.8k | const Image &image) { |
389 | 19.8k | int nb_channels = image.channel.size() - image.nb_meta_channels; |
390 | | |
391 | 19.8k | parameters->clear(); |
392 | 19.8k | size_t w = image.channel[image.nb_meta_channels].w; |
393 | 19.8k | size_t h = image.channel[image.nb_meta_channels].h; |
394 | 19.8k | JXL_DEBUG_V( |
395 | 19.8k | 7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h); |
396 | | |
397 | | // do horizontal first on wide images; vertical first on tall images |
398 | 19.8k | bool wide = (w > h); |
399 | | |
400 | 19.8k | if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w && |
401 | 12.7k | image.channel[image.nb_meta_channels + 1].h == h) { |
402 | | // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0 |
403 | | // previews |
404 | 12.4k | JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h); |
405 | 12.4k | SqueezeParams params; |
406 | | // horizontal chroma squeeze |
407 | 12.4k | params.horizontal = true; |
408 | 12.4k | params.in_place = false; |
409 | 12.4k | params.begin_c = image.nb_meta_channels + 1; |
410 | 12.4k | params.num_c = 2; |
411 | 12.4k | parameters->push_back(params); |
412 | 12.4k | params.horizontal = false; |
413 | | // vertical chroma squeeze |
414 | 12.4k | parameters->push_back(params); |
415 | 12.4k | } |
416 | 19.8k | SqueezeParams params; |
417 | 19.8k | params.begin_c = image.nb_meta_channels; |
418 | 19.8k | params.num_c = nb_channels; |
419 | 19.8k | params.in_place = true; |
420 | | |
421 | 19.8k | if (!wide) { |
422 | 14.3k | if (h > kMaxFirstPreviewSize) { |
423 | 10.0k | params.horizontal = false; |
424 | 10.0k | parameters->push_back(params); |
425 | 10.0k | h = (h + 1) / 2; |
426 | 10.0k | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
427 | 10.0k | } |
428 | 14.3k | } |
429 | 57.2k | while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) { |
430 | 37.3k | if (w > kMaxFirstPreviewSize) { |
431 | 34.5k | params.horizontal = true; |
432 | 34.5k | parameters->push_back(params); |
433 | 34.5k | w = (w + 1) / 2; |
434 | 34.5k | JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h); |
435 | 34.5k | } |
436 | 37.3k | if (h > kMaxFirstPreviewSize) { |
437 | 30.7k | params.horizontal = false; |
438 | 30.7k | parameters->push_back(params); |
439 | 30.7k | h = (h + 1) / 2; |
440 | 30.7k | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
441 | 30.7k | } |
442 | 37.3k | } |
443 | 19.8k | JXL_DEBUG_V(7, "that's it"); |
444 | 19.8k | } |
445 | | |
446 | | Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, |
447 | 269k | int num_channels) { |
448 | 269k | int c1 = parameter.begin_c; |
449 | 269k | int c2 = parameter.begin_c + parameter.num_c - 1; |
450 | 269k | if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) { |
451 | 267 | return JXL_FAILURE("Invalid channel range"); |
452 | 267 | } |
453 | 268k | return true; |
454 | 269k | } |
455 | | |
456 | 24.8k | Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) { |
457 | 24.8k | JxlMemoryManager *memory_manager = image.memory_manager(); |
458 | 24.8k | if (parameters->empty()) { |
459 | 19.8k | DefaultSqueezeParameters(parameters, image); |
460 | 19.8k | } |
461 | | |
462 | 139k | for (auto ¶meter : *parameters) { |
463 | 139k | JXL_RETURN_IF_ERROR( |
464 | 139k | CheckMetaSqueezeParams(parameter, image.channel.size())); |
465 | 138k | bool horizontal = parameter.horizontal; |
466 | 138k | bool in_place = parameter.in_place; |
467 | 138k | uint32_t beginc = parameter.begin_c; |
468 | 138k | uint32_t endc = parameter.begin_c + parameter.num_c - 1; |
469 | | |
470 | 138k | uint32_t offset; |
471 | 138k | if (beginc < image.nb_meta_channels) { |
472 | 36 | if (endc >= image.nb_meta_channels) { |
473 | 4 | return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels"); |
474 | 4 | } |
475 | 32 | if (!in_place) { |
476 | 9 | return JXL_FAILURE( |
477 | 9 | "Invalid squeeze: meta channels require in-place residuals"); |
478 | 9 | } |
479 | 23 | image.nb_meta_channels += parameter.num_c; |
480 | 23 | } |
481 | 138k | if (in_place) { |
482 | 75.6k | offset = endc + 1; |
483 | 75.6k | } else { |
484 | 63.1k | offset = image.channel.size(); |
485 | 63.1k | } |
486 | 526k | for (uint32_t c = beginc; c <= endc; c++) { |
487 | 387k | if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) { |
488 | 5 | return JXL_FAILURE("Too many squeezes: shift > 30"); |
489 | 5 | } |
490 | 387k | size_t w = image.channel[c].w; |
491 | 387k | size_t h = image.channel[c].h; |
492 | 387k | if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel"); |
493 | 387k | if (horizontal) { |
494 | 170k | image.channel[c].w = (w + 1) / 2; |
495 | 170k | if (image.channel[c].hshift >= 0) image.channel[c].hshift++; |
496 | 170k | w = w - (w + 1) / 2; |
497 | 217k | } else { |
498 | 217k | image.channel[c].h = (h + 1) / 2; |
499 | 217k | if (image.channel[c].vshift >= 0) image.channel[c].vshift++; |
500 | 217k | h = h - (h + 1) / 2; |
501 | 217k | } |
502 | 387k | JXL_RETURN_IF_ERROR(image.channel[c].shrink()); |
503 | 775k | JXL_ASSIGN_OR_RETURN(Channel placeholder, |
504 | 775k | Channel::Create(memory_manager, w, h)); |
505 | 775k | placeholder.hshift = image.channel[c].hshift; |
506 | 775k | placeholder.vshift = image.channel[c].vshift; |
507 | 775k | placeholder.component = image.channel[c].component; |
508 | 775k | image.channel.insert(image.channel.begin() + offset + (c - beginc), |
509 | 775k | std::move(placeholder)); |
510 | 775k | JXL_DEBUG_V(0, "MetaSqueeze applied, current image: %s", |
511 | 775k | image.DebugString().c_str()); |
512 | 775k | } |
513 | 138k | } |
514 | 24.4k | return true; |
515 | 24.8k | } |
516 | | |
517 | | } // namespace jxl |
518 | | |
519 | | #endif |