/src/libjxl/lib/jxl/enc_convolve_separable5.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include <cstddef> |
7 | | #include <cstdint> |
8 | | #include <cstdlib> |
9 | | |
10 | | #include "lib/jxl/base/compiler_specific.h" |
11 | | #include "lib/jxl/base/data_parallel.h" |
12 | | #include "lib/jxl/base/status.h" |
13 | | #include "lib/jxl/convolve.h" |
14 | | #include "lib/jxl/image.h" |
15 | | #include "lib/jxl/image_ops.h" |
16 | | |
17 | | #undef HWY_TARGET_INCLUDE |
18 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_convolve_separable5.cc" |
19 | | #include <hwy/foreach_target.h> |
20 | | #include <hwy/highway.h> |
21 | | |
22 | | #include "lib/jxl/base/rect.h" |
23 | | #include "lib/jxl/convolve-inl.h" |
24 | | |
25 | | HWY_BEFORE_NAMESPACE(); |
26 | | namespace jxl { |
27 | | namespace HWY_NAMESPACE { |
28 | | |
29 | | // These templates are not found via ADL. |
30 | | using hwy::HWY_NAMESPACE::Add; |
31 | | using hwy::HWY_NAMESPACE::IndicesFromVec; |
32 | | using hwy::HWY_NAMESPACE::Iota; |
33 | | using hwy::HWY_NAMESPACE::Max; |
34 | | using hwy::HWY_NAMESPACE::Min; |
35 | | using hwy::HWY_NAMESPACE::Mul; |
36 | | using hwy::HWY_NAMESPACE::MulAdd; |
37 | | using hwy::HWY_NAMESPACE::Sub; |
38 | | using hwy::HWY_NAMESPACE::Vec; |
39 | | |
40 | | using D = HWY_CAPPED(float, 16); |
41 | | using DI32 = HWY_CAPPED(int32_t, 16); |
42 | | using V = Vec<D>; |
43 | | using VI32 = Vec<DI32>; |
44 | | using I = decltype(SetTableIndices(D(), static_cast<int32_t*>(nullptr))); |
45 | | |
46 | | // 5x5 convolution by separable kernel with a single scan through the input. |
47 | | // This is more cache-efficient than separate horizontal/vertical passes, and |
48 | | // possibly faster (given enough registers) than tiling and/or transposing. |
49 | | // |
50 | | // Overview: imagine a 5x5 window around a central pixel. First convolve the |
51 | | // rows by multiplying the pixels with the corresponding weights from |
52 | | // WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these |
53 | | // intermediate results by the corresponding vertical weight, i.e. |
54 | | // vert[abs(y_offset) * 4]. Finally, store the sum of these values as the |
55 | | // convolution result at the position of the central pixel in the output. |
56 | | // |
57 | | // Each of these operations uses SIMD vectors. The central pixel and most |
58 | | // importantly the output are aligned, so neighboring pixels (e.g. x_offset=1) |
59 | | // require unaligned loads. Because weights are supplied in identical groups of |
60 | | // 4, we can use LoadDup128 to load them (slightly faster). |
61 | | // |
62 | | // Uses mirrored boundary handling. Until x >= kRadius, the horizontal |
63 | | // convolution uses Neighbors class to shuffle vectors as if each of its lanes |
64 | | // had been loaded from the mirrored offset. Similarly, the last full vector to |
65 | | // write uses mirroring. In the case of scalar vectors, Neighbors is not usable |
66 | | // and the value is loaded directly. Otherwise, the number of valid pixels |
67 | | // modulo the vector size enables a small optimization: for smaller offsets, |
68 | | // a non-mirrored load is sufficient. |
69 | | class Separable5Impl { |
70 | | public: |
71 | | using Simd = HWY_CAPPED(float, 16); |
72 | | static constexpr int64_t kRadius = 2; |
73 | | |
74 | | Separable5Impl(const ImageF* in, const Rect& rect, |
75 | | const WeightsSeparable5* weights, ThreadPool* pool, |
76 | | ImageF* out) |
77 | 0 | : in(in), rect(rect), weights(weights), pool(pool), out(out) {} Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*) Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*) Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*) |
78 | | |
79 | 0 | Status Run() { |
80 | | #if HWY_TARGET == HWY_SCALAR |
81 | | // First/Last use mirrored loads of up to +/- kRadius. |
82 | | size_t min_width = 2 * kRadius; |
83 | | #else |
84 | 0 | size_t min_width = Lanes(Simd()) + kRadius; |
85 | 0 | #endif |
86 | |
|
87 | 0 | if (rect.xsize() >= min_width) { |
88 | 0 | JXL_ENSURE(SameSize(rect, *out)); |
89 | | |
90 | 0 | switch (rect.xsize() % Lanes(Simd())) { |
91 | 0 | case 0: |
92 | 0 | RunRows<0>(); |
93 | 0 | break; |
94 | 0 | case 1: |
95 | 0 | RunRows<1>(); |
96 | 0 | break; |
97 | 0 | case 2: |
98 | 0 | RunRows<2>(); |
99 | 0 | break; |
100 | 0 | default: |
101 | 0 | RunRows<3>(); |
102 | 0 | break; |
103 | 0 | } |
104 | 0 | return true; |
105 | 0 | } else { |
106 | 0 | return SlowSeparable5(*in, rect, *weights, pool, out, Rect(*out)); |
107 | 0 | } |
108 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::Run() Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::Run() Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::Run() |
109 | | |
110 | | template <size_t kSizeModN, bool kBorder> |
111 | 0 | JXL_NOINLINE void ConvolveRow(const uint32_t y) { |
112 | 0 | const D d; |
113 | 0 | const int64_t stride = in->PixelsPerRow(); |
114 | 0 | const int64_t neg_stride = -stride; // allows LEA addressing. |
115 | 0 | const size_t xsize = rect.xsize(); |
116 | 0 | const float* const JXL_RESTRICT row_m = rect.ConstRow(*in, y); |
117 | 0 | float* const JXL_RESTRICT row_out = out->Row(y); |
118 | 0 | const float* JXL_RESTRICT row_t2 = row_m + 2 * neg_stride; |
119 | 0 | const float* JXL_RESTRICT row_t1 = row_m + 1 * neg_stride; |
120 | 0 | const float* JXL_RESTRICT row_b1 = row_m + 1 * stride; |
121 | 0 | const float* JXL_RESTRICT row_b2 = row_m + 2 * stride; |
122 | |
|
123 | 0 | if (kBorder) { |
124 | 0 | size_t img_y = rect.y0() + y; |
125 | 0 | if (in->ysize() <= 2 * kRadius) { // Very special: double reflections |
126 | 0 | static constexpr size_t kBorderLut[4 * 8] = { |
127 | 0 | 0, 0, 0, 0, 0, 0xBAD, 0xBAD, 0xBAD, // 1 row |
128 | 0 | 1, 0, 0, 1, 1, 0, 0xBAD, 0xBAD, // 2 rows |
129 | 0 | 1, 0, 0, 1, 2, 2, 1, 0xBAD, // 3 rows |
130 | 0 | 1, 0, 0, 1, 2, 3, 3, 2, // 4 rows |
131 | 0 | }; |
132 | 0 | JXL_DASSERT(in->ysize() <= 4); |
133 | 0 | size_t o = in->ysize() * 8 - 6 + img_y; |
134 | 0 | row_t2 = in->ConstRow(kBorderLut[o - 2]) + rect.x0(); |
135 | 0 | row_t1 = in->ConstRow(kBorderLut[o - 1]) + rect.x0(); |
136 | 0 | row_b1 = in->ConstRow(kBorderLut[o + 1]) + rect.x0(); |
137 | 0 | row_b2 = in->ConstRow(kBorderLut[o + 2]) + rect.x0(); |
138 | 0 | } else if (img_y < kRadius) { |
139 | 0 | if (img_y == 0) { |
140 | 0 | row_t1 = row_m; |
141 | 0 | row_t2 = row_b1; |
142 | 0 | } else { |
143 | 0 | JXL_DASSERT(img_y == 1); |
144 | 0 | row_t2 = row_t1; |
145 | 0 | } |
146 | 0 | } else { |
147 | 0 | JXL_DASSERT(img_y + kRadius >= in->ysize()); |
148 | 0 | if (img_y + 1 == in->ysize()) { |
149 | 0 | row_b1 = row_m; |
150 | 0 | row_b2 = row_t1; |
151 | 0 | } else { |
152 | 0 | JXL_DASSERT(img_y + 2 == in->ysize()); |
153 | 0 | row_b2 = row_b1; |
154 | 0 | } |
155 | 0 | } |
156 | 0 | } |
157 | |
|
158 | 0 | const V wh0 = LoadDup128(d, weights->horz + 0 * 4); |
159 | 0 | const V wh1 = LoadDup128(d, weights->horz + 1 * 4); |
160 | 0 | const V wh2 = LoadDup128(d, weights->horz + 2 * 4); |
161 | 0 | const V wv0 = LoadDup128(d, weights->vert + 0 * 4); |
162 | 0 | const V wv1 = LoadDup128(d, weights->vert + 1 * 4); |
163 | 0 | const V wv2 = LoadDup128(d, weights->vert + 2 * 4); |
164 | 0 | const I ml1 = MirrorLanes<1>(); |
165 | 0 | const I ml2 = MirrorLanes<2>(); |
166 | |
|
167 | 0 | size_t x = 0; |
168 | | |
169 | | // More than one iteration for scalars. |
170 | 0 | for (; x < kRadius; x += Lanes(d)) { |
171 | 0 | const V conv0 = |
172 | 0 | Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0); |
173 | |
|
174 | 0 | const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); |
175 | 0 | const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); |
176 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
177 | |
|
178 | 0 | const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2); |
179 | 0 | const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2); |
180 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
181 | 0 | Store(conv2, d, row_out + x); |
182 | 0 | } |
183 | | |
184 | | // Main loop: load inputs without padding |
185 | 0 | for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { |
186 | 0 | const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0); |
187 | |
|
188 | 0 | const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2); |
189 | 0 | const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2); |
190 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
191 | |
|
192 | 0 | const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2); |
193 | 0 | const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2); |
194 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
195 | 0 | Store(conv2, d, row_out + x); |
196 | 0 | } |
197 | | |
198 | | // Last full vector to write (the above loop handled mod >= kRadius) |
199 | | #if HWY_TARGET == HWY_SCALAR |
200 | | while (x < xsize) { |
201 | | #else |
202 | 0 | if (kSizeModN < kRadius) { |
203 | 0 | #endif |
204 | 0 | const V conv0 = Mul( |
205 | 0 | HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, ml1, ml2), |
206 | 0 | wv0); |
207 | |
|
208 | 0 | const V conv1t = HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, |
209 | 0 | wh2, ml1, ml2); |
210 | 0 | const V conv1b = HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, |
211 | 0 | wh2, ml1, ml2); |
212 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
213 | |
|
214 | 0 | const V conv2t = HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, |
215 | 0 | wh2, ml1, ml2); |
216 | 0 | const V conv2b = HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, |
217 | 0 | wh2, ml1, ml2); |
218 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
219 | 0 | Store(conv2, d, row_out + x); |
220 | 0 | x += Lanes(d); |
221 | 0 | } |
222 | | |
223 | | // If mod = 0, the above vector was the last. |
224 | 0 | if (kSizeModN != 0) { |
225 | 0 | const float* JXL_RESTRICT rows[5] = {row_t2, row_t1, row_m, row_b1, |
226 | 0 | row_b2}; |
227 | 0 | for (; x < xsize; ++x) { |
228 | 0 | float mul = 0.0f; |
229 | 0 | for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { |
230 | 0 | const float wy = weights->vert[std::abs(dy) * 4]; |
231 | 0 | const float* clamped_row = rows[dy + 2]; |
232 | 0 | for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { |
233 | 0 | const float wx = weights->horz[std::abs(dx) * 4]; |
234 | 0 | const int64_t clamped_x = Mirror(x + dx, xsize); |
235 | 0 | mul += clamped_row[clamped_x] * wx * wy; |
236 | 0 | } |
237 | 0 | } |
238 | 0 | row_out[x] = mul; |
239 | 0 | } |
240 | 0 | } |
241 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<0ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<0ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<1ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<1ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<2ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<2ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<3ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<3ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<0ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<0ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<1ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<1ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<2ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<2ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<3ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<3ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<0ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<0ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<1ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<1ul, false>(unsigned int) Unexecuted instantiation: void 
jxl::N_SSE2::Separable5Impl::ConvolveRow<2ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<2ul, false>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<3ul, true>(unsigned int) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<3ul, false>(unsigned int) |
242 | | |
243 | | private: |
244 | | template <size_t kSizeModN> |
245 | 0 | JXL_INLINE void RunRows() { |
246 | | // NB: borders are image-bound, not rect-bound. |
247 | 0 | size_t ybegin = rect.y0(); |
248 | 0 | size_t yend = rect.y1(); |
249 | 0 | while (ybegin < yend && ybegin < kRadius) { |
250 | 0 | ybegin++; |
251 | 0 | } |
252 | 0 | while (ybegin < yend && yend + kRadius > in->ysize()) { |
253 | 0 | yend--; |
254 | 0 | } |
255 | 0 | if (ybegin > rect.y0()) { |
256 | 0 | RunBorderRows<kSizeModN>(0, ybegin - rect.y0()); |
257 | 0 | } |
258 | 0 | if (yend > ybegin) { |
259 | 0 | RunInteriorRows<kSizeModN>(ybegin - rect.y0(), yend - rect.y0()); |
260 | 0 | } |
261 | 0 | if (yend < rect.y1()) { |
262 | 0 | RunBorderRows<kSizeModN>(yend - rect.y0(), rect.ysize()); |
263 | 0 | } |
264 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<0ul>() Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<1ul>() Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<2ul>() Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<3ul>() Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<0ul>() Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<1ul>() Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<2ul>() Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<3ul>() Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<0ul>() Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<1ul>() Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<2ul>() Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<3ul>() |
265 | | |
266 | | template <size_t kSizeModN> |
267 | 0 | JXL_INLINE void RunBorderRows(const size_t ybegin, const size_t yend) { |
268 | 0 | for (size_t y = ybegin; y < yend; ++y) { |
269 | 0 | ConvolveRow<kSizeModN, true>(y); |
270 | 0 | } |
271 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long) |
272 | | |
273 | | template <size_t kSizeModN> |
274 | 0 | JXL_INLINE void RunInteriorRows(const size_t ybegin, const size_t yend) { |
275 | 0 | const auto process_row = [&](const uint32_t y, size_t /*thread*/) HWY_ATTR { |
276 | 0 | ConvolveRow<kSizeModN, false>(y); |
277 | 0 | return true; |
278 | 0 | }; Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: 
jxl::N_SSE2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const |
279 | 0 | Status status = RunOnPool(pool, ybegin, yend, ThreadPool::NoInit, |
280 | 0 | process_row, "Convolve"); |
281 | 0 | JXL_DASSERT(status); |
282 | 0 | (void)status; |
283 | 0 | } Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long) |
284 | | |
285 | | // Returns IndicesFromVec(d, indices) such that TableLookupLanes on the |
286 | | // rightmost unaligned vector (rightmost sample in its most-significant lane) |
287 | | // returns the mirrored values, with the mirror outside the last valid sample. |
288 | | template <size_t M> |
289 | 0 | static JXL_INLINE I MirrorLanes() { |
290 | 0 | static_assert(M >= 1 && M <= 2, "Only M in range {1..2} is supported"); |
291 | 0 | D d; |
292 | 0 | DI32 di32; |
293 | 0 | const VI32 up = Min(Iota(di32, M), Set(di32, Lanes(d) - 1)); |
294 | 0 | const VI32 down = Max(Iota(di32, M - Lanes(d)), Zero(di32)); |
295 | 0 | return IndicesFromVec(d, Sub(up, down)); |
296 | 0 | } Unexecuted instantiation: hwy::N_SSE4::Indices128<float, 4ul> jxl::N_SSE4::Separable5Impl::MirrorLanes<1ul>() Unexecuted instantiation: hwy::N_SSE4::Indices128<float, 4ul> jxl::N_SSE4::Separable5Impl::MirrorLanes<2ul>() Unexecuted instantiation: hwy::N_AVX2::Indices256<float> jxl::N_AVX2::Separable5Impl::MirrorLanes<1ul>() Unexecuted instantiation: hwy::N_AVX2::Indices256<float> jxl::N_AVX2::Separable5Impl::MirrorLanes<2ul>() Unexecuted instantiation: hwy::N_SSE2::Indices128<float, 4ul> jxl::N_SSE2::Separable5Impl::MirrorLanes<1ul>() Unexecuted instantiation: hwy::N_SSE2::Indices128<float, 4ul> jxl::N_SSE2::Separable5Impl::MirrorLanes<2ul>() |
297 | | |
298 | | // Same as HorzConvolve for the first/last vector in a row. |
299 | | static JXL_MAYBE_INLINE V HorzConvolveFirst( |
300 | | const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, |
301 | 0 | const V wh0, const V wh1, const V wh2) { |
302 | 0 | const D d; |
303 | 0 | const V c = LoadU(d, row + x); |
304 | 0 | const V mul0 = Mul(c, wh0); |
305 | |
|
306 | | #if HWY_TARGET == HWY_SCALAR |
307 | | const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); |
308 | | const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); |
309 | | #else |
310 | 0 | (void)xsize; |
311 | 0 | const V l1 = Neighbors::FirstL1(c); |
312 | 0 | const V l2 = Neighbors::FirstL2(c); |
313 | 0 | #endif |
314 | |
|
315 | 0 | const V r1 = LoadU(d, row + x + 1); |
316 | 0 | const V r2 = LoadU(d, row + x + 2); |
317 | |
|
318 | 0 | const V mul1 = MulAdd(Add(l1, r1), wh1, mul0); |
319 | 0 | const V mul2 = MulAdd(Add(l2, r2), wh2, mul1); |
320 | 0 | return mul2; |
321 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) |
322 | | |
323 | | template <size_t kSizeModN> |
324 | | static JXL_MAYBE_INLINE V HorzConvolveLast( |
325 | | const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, |
326 | 0 | const V wh0, const V wh1, const V wh2, const I ml1, const I ml2) { |
327 | 0 | const D d; |
328 | 0 | const V c = LoadU(d, row + x); |
329 | 0 | const V mul0 = Mul(c, wh0); |
330 | |
|
331 | 0 | const V l1 = LoadU(d, row + x - 1); |
332 | 0 | const V l2 = LoadU(d, row + x - 2); |
333 | |
|
334 | 0 | V r1; |
335 | 0 | V r2; |
336 | | #if HWY_TARGET == HWY_SCALAR |
337 | | r1 = LoadU(d, row + Mirror(x + 1, xsize)); |
338 | | r2 = LoadU(d, row + Mirror(x + 2, xsize)); |
339 | | (void)ml1; |
340 | | (void)ml2; |
341 | | #else |
342 | 0 | const size_t N = Lanes(d); |
343 | 0 | if (kSizeModN == 0) { |
344 | 0 | r2 = TableLookupLanes(c, ml2); |
345 | 0 | r1 = TableLookupLanes(c, ml1); |
346 | 0 | } else { // == 1 |
347 | 0 | const auto last = LoadU(d, row + xsize - N); |
348 | 0 | r2 = TableLookupLanes(last, ml1); |
349 | 0 | r1 = last; |
350 | 0 | } |
351 | 0 | #endif |
352 | | |
353 | | // Sum of pixels with Manhattan distance i, multiplied by weights[i]. |
354 | 0 | const V sum1 = Add(l1, r1); |
355 | 0 | const V mul1 = MulAdd(sum1, wh1, mul0); |
356 | 0 | const V sum2 = Add(l2, r2); |
357 | 0 | const V mul2 = MulAdd(sum2, wh2, mul1); |
358 | 0 | return mul2; |
359 | 0 | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>) Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 
4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>) Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>) |
360 | | |
361 | | // Requires kRadius valid pixels before/after pos. |
362 | | static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, |
363 | | const V wh0, const V wh1, |
364 | 0 | const V wh2) { |
365 | 0 | const D d; |
366 | 0 | const V c = LoadU(d, pos); |
367 | 0 | const V mul0 = Mul(c, wh0); |
368 | | |
369 | | // Loading anew is faster than combining vectors. |
370 | 0 | const V l1 = LoadU(d, pos - 1); |
371 | 0 | const V r1 = LoadU(d, pos + 1); |
372 | 0 | const V l2 = LoadU(d, pos - 2); |
373 | 0 | const V r2 = LoadU(d, pos + 2); |
374 | | // Sum of pixels with Manhattan distance i, multiplied by weights[i]. |
375 | 0 | const V sum1 = Add(l1, r1); |
376 | 0 | const V mul1 = MulAdd(sum1, wh1, mul0); |
377 | 0 | const V sum2 = Add(l2, r2); |
378 | 0 | const V mul2 = MulAdd(sum2, wh2, mul1); |
379 | 0 | return mul2; |
380 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::HorzConvolve(float const*, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::HorzConvolve(float const*, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::HorzConvolve(float const*, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) |
381 | | |
382 | | const ImageF* in; |
383 | | const Rect rect; |
384 | | const WeightsSeparable5* weights; |
385 | | ThreadPool* pool; |
386 | | ImageF* out; |
387 | | }; |
388 | | |
389 | | Status Separable5(const ImageF& in, const Rect& rect, |
390 | | const WeightsSeparable5& weights, ThreadPool* pool, |
391 | 0 | ImageF* out) { |
392 | 0 | Separable5Impl impl(&in, rect, &weights, pool, out); |
393 | 0 | return impl.Run(); |
394 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*) Unexecuted instantiation: jxl::N_AVX2::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*) Unexecuted instantiation: jxl::N_SSE2::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*) |
395 | | |
396 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
397 | | } // namespace HWY_NAMESPACE |
398 | | } // namespace jxl |
399 | | HWY_AFTER_NAMESPACE(); |
400 | | |
401 | | #if HWY_ONCE |
402 | | namespace jxl { |
403 | | |
404 | | HWY_EXPORT(Separable5); |
405 | | Status Separable5(const ImageF& in, const Rect& rect, |
406 | | const WeightsSeparable5& weights, ThreadPool* pool, |
407 | 0 | ImageF* out) { |
408 | 0 | return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out); |
409 | 0 | } |
410 | | |
411 | | } // namespace jxl |
412 | | #endif // HWY_ONCE |