/src/libjxl/lib/jxl/convolve_separable5.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/convolve.h" |
7 | | |
8 | | #undef HWY_TARGET_INCLUDE |
9 | | #define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc" |
10 | | #include <hwy/foreach_target.h> |
11 | | #include <hwy/highway.h> |
12 | | |
13 | | #include "lib/jxl/base/rect.h" |
14 | | #include "lib/jxl/convolve-inl.h" |
15 | | |
16 | | HWY_BEFORE_NAMESPACE(); |
17 | | namespace jxl { |
18 | | namespace HWY_NAMESPACE { |
19 | | |
20 | | // These templates are not found via ADL. |
21 | | using hwy::HWY_NAMESPACE::Add; |
22 | | using hwy::HWY_NAMESPACE::Mul; |
23 | | using hwy::HWY_NAMESPACE::MulAdd; |
24 | | using hwy::HWY_NAMESPACE::Vec; |
25 | | |
26 | | // 5x5 convolution by separable kernel with a single scan through the input. |
27 | | // This is more cache-efficient than separate horizontal/vertical passes, and |
28 | | // possibly faster (given enough registers) than tiling and/or transposing. |
29 | | // |
30 | | // Overview: imagine a 5x5 window around a central pixel. First convolve the |
31 | | // rows by multiplying the pixels with the corresponding weights from |
32 | | // WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these |
33 | | // intermediate results by the corresponding vertical weight, i.e. |
34 | | // vert[abs(y_offset) * 4]. Finally, store the sum of these values as the |
35 | | // convolution result at the position of the central pixel in the output. |
36 | | // |
37 | | // Each of these operations uses SIMD vectors. The central pixel and most |
38 | | // importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1) |
39 | | // require unaligned loads. Because weights are supplied in identical groups of |
40 | | // 4, we can use LoadDup128 to load them (slightly faster). |
41 | | // |
42 | | // Uses mirrored boundary handling. Until x >= kRadius, the horizontal |
43 | | // convolution uses Neighbors class to shuffle vectors as if each of its lanes |
44 | | // had been loaded from the mirrored offset. Similarly, the last full vector to |
45 | | // write uses mirroring. In the case of scalar vectors, Neighbors is not usable |
46 | | // and the value is loaded directly. Otherwise, the number of valid pixels |
47 | | // modulo the vector size enables a small optimization: for smaller offsets, |
48 | | // a non-mirrored load is sufficient. |
49 | | class Separable5Strategy { |
50 | | using D = HWY_CAPPED(float, 16); |
51 | | using V = Vec<D>; |
52 | | |
53 | | public: |
54 | | static constexpr int64_t kRadius = 2; |
55 | | |
56 | | template <size_t kSizeModN, class WrapRow> |
57 | | static JXL_MAYBE_INLINE void ConvolveRow( |
58 | | const float* const JXL_RESTRICT row_m, const size_t xsize, |
59 | | const int64_t stride, const WrapRow& wrap_row, |
60 | 0 | const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) { |
61 | 0 | const D d; |
62 | 0 | const int64_t neg_stride = -stride; // allows LEA addressing. |
63 | 0 | const float* const JXL_RESTRICT row_t2 = |
64 | 0 | wrap_row(row_m + 2 * neg_stride, stride); |
65 | 0 | const float* const JXL_RESTRICT row_t1 = |
66 | 0 | wrap_row(row_m + 1 * neg_stride, stride); |
67 | 0 | const float* const JXL_RESTRICT row_b1 = |
68 | 0 | wrap_row(row_m + 1 * stride, stride); |
69 | 0 | const float* const JXL_RESTRICT row_b2 = |
70 | 0 | wrap_row(row_m + 2 * stride, stride); |
71 | |
|
72 | 0 | const V wh0 = LoadDup128(d, weights.horz + 0 * 4); |
73 | 0 | const V wh1 = LoadDup128(d, weights.horz + 1 * 4); |
74 | 0 | const V wh2 = LoadDup128(d, weights.horz + 2 * 4); |
75 | 0 | const V wv0 = LoadDup128(d, weights.vert + 0 * 4); |
76 | 0 | const V wv1 = LoadDup128(d, weights.vert + 1 * 4); |
77 | 0 | const V wv2 = LoadDup128(d, weights.vert + 2 * 4); |
78 | |
|
79 | 0 | size_t x = 0; |
80 | | |
81 | | // More than one iteration for scalars. |
82 | 0 | for (; x < kRadius; x += Lanes(d)) { |
83 | 0 | const V conv0 = |
84 | 0 | Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0); |
85 | |
|
86 | 0 | const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); |
87 | 0 | const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); |
88 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
89 | |
|
90 | 0 | const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2); |
91 | 0 | const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2); |
92 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
93 | 0 | Store(conv2, d, row_out + x); |
94 | 0 | } |
95 | | |
96 | | // Main loop: load inputs without padding |
97 | 0 | for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { |
98 | 0 | const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0); |
99 | |
|
100 | 0 | const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2); |
101 | 0 | const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2); |
102 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
103 | |
|
104 | 0 | const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2); |
105 | 0 | const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2); |
106 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
107 | 0 | Store(conv2, d, row_out + x); |
108 | 0 | } |
109 | | |
110 | | // Last full vector to write (the above loop handled mod >= kRadius) |
111 | 0 | #if HWY_TARGET == HWY_SCALAR |
112 | 0 | while (x < xsize) { |
113 | | #else |
114 | | if (kSizeModN < kRadius) { |
115 | | #endif |
116 | 0 | const V conv0 = |
117 | 0 | Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0); |
118 | |
|
119 | 0 | const V conv1t = |
120 | 0 | HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2); |
121 | 0 | const V conv1b = |
122 | 0 | HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2); |
123 | 0 | const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); |
124 | |
|
125 | 0 | const V conv2t = |
126 | 0 | HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2); |
127 | 0 | const V conv2b = |
128 | 0 | HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2); |
129 | 0 | const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); |
130 | 0 | Store(conv2, d, row_out + x); |
131 | 0 | x += Lanes(d); |
132 | 0 | } |
133 | | |
134 | | // If mod = 0, the above vector was the last. |
135 | 0 | if (kSizeModN != 0) { |
136 | 0 | for (; x < xsize; ++x) { |
137 | 0 | float mul = 0.0f; |
138 | 0 | for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { |
139 | 0 | const float wy = weights.vert[std::abs(dy) * 4]; |
140 | 0 | const float* clamped_row = wrap_row(row_m + dy * stride, stride); |
141 | 0 | for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { |
142 | 0 | const float wx = weights.horz[std::abs(dx) * 4]; |
143 | 0 | const int64_t clamped_x = Mirror(x + dx, xsize); |
144 | 0 | mul += clamped_row[clamped_x] * wx * wy; |
145 | 0 | } |
146 | 0 | } |
147 | 0 | row_out[x] = mul; |
148 | 0 | } |
149 | 0 | } |
150 | 0 | } Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<0ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<0ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<1ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<1ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<2ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<2ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<3ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*) Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<3ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*) |
151 | | |
152 | | private: |
153 | | // Same as HorzConvolve for the first/last vector in a row. |
154 | | static JXL_MAYBE_INLINE V HorzConvolveFirst( |
155 | | const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, |
156 | 0 | const V wh0, const V wh1, const V wh2) { |
157 | 0 | const D d; |
158 | 0 | const V c = LoadU(d, row + x); |
159 | 0 | const V mul0 = Mul(c, wh0); |
160 | |
|
161 | 0 | #if HWY_TARGET == HWY_SCALAR |
162 | 0 | const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); |
163 | 0 | const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); |
164 | | #else |
165 | | (void)xsize; |
166 | | const V l1 = Neighbors::FirstL1(c); |
167 | | const V l2 = Neighbors::FirstL2(c); |
168 | | #endif |
169 | |
|
170 | 0 | const V r1 = LoadU(d, row + x + 1); |
171 | 0 | const V r2 = LoadU(d, row + x + 2); |
172 | |
|
173 | 0 | const V mul1 = MulAdd(Add(l1, r1), wh1, mul0); |
174 | 0 | const V mul2 = MulAdd(Add(l2, r2), wh2, mul1); |
175 | 0 | return mul2; |
176 | 0 | } |
177 | | |
178 | | template <size_t kSizeModN> |
179 | | static JXL_MAYBE_INLINE V |
180 | | HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x, |
181 | 0 | const int64_t xsize, const V wh0, const V wh1, const V wh2) { |
182 | 0 | const D d; |
183 | 0 | const V c = LoadU(d, row + x); |
184 | 0 | const V mul0 = Mul(c, wh0); |
185 | |
|
186 | 0 | const V l1 = LoadU(d, row + x - 1); |
187 | 0 | const V l2 = LoadU(d, row + x - 2); |
188 | |
|
189 | 0 | V r1; |
190 | 0 | V r2; |
191 | 0 | #if HWY_TARGET == HWY_SCALAR |
192 | 0 | r1 = LoadU(d, row + Mirror(x + 1, xsize)); |
193 | 0 | r2 = LoadU(d, row + Mirror(x + 2, xsize)); |
194 | | #else |
195 | | const size_t N = Lanes(d); |
196 | | if (kSizeModN == 0) { |
197 | | r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); |
198 | | r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); |
199 | | } else { // == 1 |
200 | | const auto last = LoadU(d, row + xsize - N); |
201 | | r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); |
202 | | r1 = last; |
203 | | } |
204 | | #endif |
205 | | |
206 | | // Sum of pixels with Manhattan distance i, multiplied by weights[i]. |
207 | 0 | const V sum1 = Add(l1, r1); |
208 | 0 | const V mul1 = MulAdd(sum1, wh1, mul0); |
209 | 0 | const V sum2 = Add(l2, r2); |
210 | 0 | const V mul2 = MulAdd(sum2, wh2, mul1); |
211 | 0 | return mul2; |
212 | 0 | } Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) |
213 | | |
214 | | // Requires kRadius valid pixels before/after pos. |
215 | | static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, |
216 | | const V wh0, const V wh1, |
217 | 0 | const V wh2) { |
218 | 0 | const D d; |
219 | 0 | const V c = LoadU(d, pos); |
220 | 0 | const V mul0 = Mul(c, wh0); |
221 | | |
222 | | // Loading anew is faster than combining vectors. |
223 | 0 | const V l1 = LoadU(d, pos - 1); |
224 | 0 | const V r1 = LoadU(d, pos + 1); |
225 | 0 | const V l2 = LoadU(d, pos - 2); |
226 | 0 | const V r2 = LoadU(d, pos + 2); |
227 | | // Sum of pixels with Manhattan distance i, multiplied by weights[i]. |
228 | 0 | const V sum1 = Add(l1, r1); |
229 | 0 | const V mul1 = MulAdd(sum1, wh1, mul0); |
230 | 0 | const V sum2 = Add(l2, r2); |
231 | 0 | const V mul2 = MulAdd(sum2, wh2, mul1); |
232 | 0 | return mul2; |
233 | 0 | } |
234 | | }; |
235 | | |
236 | | Status Separable5(const ImageF& in, const Rect& rect, |
237 | | const WeightsSeparable5& weights, ThreadPool* pool, |
238 | 0 | ImageF* out) { |
239 | 0 | using Conv = ConvolveT<Separable5Strategy>; |
240 | 0 | if (rect.xsize() >= Conv::MinWidth()) { |
241 | 0 | JXL_ENSURE(SameSize(rect, *out)); |
242 | 0 | JXL_ENSURE(rect.xsize() >= Conv::MinWidth()); |
243 | | |
244 | 0 | Conv::Run(in, rect, weights, pool, out); |
245 | 0 | return true; |
246 | 0 | } |
247 | | |
248 | 0 | return SlowSeparable5(in, rect, weights, pool, out, Rect(*out)); |
249 | 0 | } |
250 | | |
251 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
252 | | } // namespace HWY_NAMESPACE |
253 | | } // namespace jxl |
254 | | HWY_AFTER_NAMESPACE(); |
255 | | |
256 | | #if HWY_ONCE |
257 | | namespace jxl { |
258 | | |
259 | | HWY_EXPORT(Separable5); |
260 | | Status Separable5(const ImageF& in, const Rect& rect, |
261 | | const WeightsSeparable5& weights, ThreadPool* pool, |
262 | 0 | ImageF* out) { |
263 | 0 | return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out); |
264 | 0 | } |
265 | | |
266 | | } // namespace jxl |
267 | | #endif // HWY_ONCE |