Coverage Report

Created: 2025-06-22 08:04

/src/libjxl/lib/jxl/convolve_separable5.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/convolve.h"
7
8
#undef HWY_TARGET_INCLUDE
9
#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc"
10
#include <hwy/foreach_target.h>
11
#include <hwy/highway.h>
12
13
#include "lib/jxl/base/rect.h"
14
#include "lib/jxl/convolve-inl.h"
15
16
HWY_BEFORE_NAMESPACE();
17
namespace jxl {
18
namespace HWY_NAMESPACE {
19
20
// These templates are not found via ADL.
21
using hwy::HWY_NAMESPACE::Add;
22
using hwy::HWY_NAMESPACE::Mul;
23
using hwy::HWY_NAMESPACE::MulAdd;
24
using hwy::HWY_NAMESPACE::Vec;
25
26
// 5x5 convolution by separable kernel with a single scan through the input.
27
// This is more cache-efficient than separate horizontal/vertical passes, and
28
// possibly faster (given enough registers) than tiling and/or transposing.
29
//
30
// Overview: imagine a 5x5 window around a central pixel. First convolve the
31
// rows by multiplying the pixels with the corresponding weights from
32
// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
33
// intermediate results by the corresponding vertical weight, i.e.
34
// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
35
// convolution result at the position of the central pixel in the output.
36
//
37
// Each of these operations uses SIMD vectors. The central pixel and most
38
// importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1)
39
// require unaligned loads. Because weights are supplied in identical groups of
40
// 4, we can use LoadDup128 to load them (slightly faster).
41
//
42
// Uses mirrored boundary handling. Until x >= kRadius, the horizontal
43
// convolution uses Neighbors class to shuffle vectors as if each of its lanes
44
// had been loaded from the mirrored offset. Similarly, the last full vector to
45
// write uses mirroring. In the case of scalar vectors, Neighbors is not usable
46
// and the value is loaded directly. Otherwise, the number of valid pixels
47
// modulo the vector size enables a small optimization: for smaller offsets,
48
// a non-mirrored load is sufficient.
49
class Separable5Strategy {
50
  using D = HWY_CAPPED(float, 16);
51
  using V = Vec<D>;
52
53
 public:
54
  static constexpr int64_t kRadius = 2;
55
56
  template <size_t kSizeModN, class WrapRow>
57
  static JXL_MAYBE_INLINE void ConvolveRow(
58
      const float* const JXL_RESTRICT row_m, const size_t xsize,
59
      const int64_t stride, const WrapRow& wrap_row,
60
0
      const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) {
61
0
    const D d;
62
0
    const int64_t neg_stride = -stride;  // allows LEA addressing.
63
0
    const float* const JXL_RESTRICT row_t2 =
64
0
        wrap_row(row_m + 2 * neg_stride, stride);
65
0
    const float* const JXL_RESTRICT row_t1 =
66
0
        wrap_row(row_m + 1 * neg_stride, stride);
67
0
    const float* const JXL_RESTRICT row_b1 =
68
0
        wrap_row(row_m + 1 * stride, stride);
69
0
    const float* const JXL_RESTRICT row_b2 =
70
0
        wrap_row(row_m + 2 * stride, stride);
71
72
0
    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
73
0
    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
74
0
    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
75
0
    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
76
0
    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
77
0
    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
78
79
0
    size_t x = 0;
80
81
    // More than one iteration for scalars.
82
0
    for (; x < kRadius; x += Lanes(d)) {
83
0
      const V conv0 =
84
0
          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
85
86
0
      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
87
0
      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
88
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
89
90
0
      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
91
0
      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
92
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
93
0
      Store(conv2, d, row_out + x);
94
0
    }
95
96
    // Main loop: load inputs without padding
97
0
    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
98
0
      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0);
99
100
0
      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
101
0
      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
102
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
103
104
0
      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
105
0
      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
106
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
107
0
      Store(conv2, d, row_out + x);
108
0
    }
109
110
    // Last full vector to write (the above loop handled mod >= kRadius)
111
0
#if HWY_TARGET == HWY_SCALAR
112
0
    while (x < xsize) {
113
#else
114
    if (kSizeModN < kRadius) {
115
#endif
116
0
      const V conv0 =
117
0
          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0);
118
119
0
      const V conv1t =
120
0
          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2);
121
0
      const V conv1b =
122
0
          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2);
123
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
124
125
0
      const V conv2t =
126
0
          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2);
127
0
      const V conv2b =
128
0
          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2);
129
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
130
0
      Store(conv2, d, row_out + x);
131
0
      x += Lanes(d);
132
0
    }
133
134
    // If mod = 0, the above vector was the last.
135
0
    if (kSizeModN != 0) {
136
0
      for (; x < xsize; ++x) {
137
0
        float mul = 0.0f;
138
0
        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
139
0
          const float wy = weights.vert[std::abs(dy) * 4];
140
0
          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
141
0
          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
142
0
            const float wx = weights.horz[std::abs(dx) * 4];
143
0
            const int64_t clamped_x = Mirror(x + dx, xsize);
144
0
            mul += clamped_row[clamped_x] * wx * wy;
145
0
          }
146
0
        }
147
0
        row_out[x] = mul;
148
0
      }
149
0
    }
150
0
  }
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<0ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<0ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<1ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<1ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<2ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<2ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<3ul, jxl::WrapRowMirror>(float const*, unsigned long, long, jxl::WrapRowMirror const&, jxl::WeightsSeparable5 const&, float*)
Unexecuted instantiation: void jxl::N_SCALAR::Separable5Strategy::ConvolveRow<3ul, jxl::WrapRowUnchanged>(float const*, unsigned long, long, jxl::WrapRowUnchanged const&, jxl::WeightsSeparable5 const&, float*)
151
152
 private:
153
  // Same as HorzConvolve for the first/last vector in a row.
154
  static JXL_MAYBE_INLINE V HorzConvolveFirst(
155
      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
156
0
      const V wh0, const V wh1, const V wh2) {
157
0
    const D d;
158
0
    const V c = LoadU(d, row + x);
159
0
    const V mul0 = Mul(c, wh0);
160
161
0
#if HWY_TARGET == HWY_SCALAR
162
0
    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
163
0
    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
164
#else
165
    (void)xsize;
166
    const V l1 = Neighbors::FirstL1(c);
167
    const V l2 = Neighbors::FirstL2(c);
168
#endif
169
170
0
    const V r1 = LoadU(d, row + x + 1);
171
0
    const V r2 = LoadU(d, row + x + 2);
172
173
0
    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
174
0
    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
175
0
    return mul2;
176
0
  }
177
178
  template <size_t kSizeModN>
179
  static JXL_MAYBE_INLINE V
180
  HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x,
181
0
                   const int64_t xsize, const V wh0, const V wh1, const V wh2) {
182
0
    const D d;
183
0
    const V c = LoadU(d, row + x);
184
0
    const V mul0 = Mul(c, wh0);
185
186
0
    const V l1 = LoadU(d, row + x - 1);
187
0
    const V l2 = LoadU(d, row + x - 2);
188
189
0
    V r1;
190
0
    V r2;
191
0
#if HWY_TARGET == HWY_SCALAR
192
0
    r1 = LoadU(d, row + Mirror(x + 1, xsize));
193
0
    r2 = LoadU(d, row + Mirror(x + 2, xsize));
194
#else
195
    const size_t N = Lanes(d);
196
    if (kSizeModN == 0) {
197
      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
198
      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
199
    } else {  // == 1
200
      const auto last = LoadU(d, row + xsize - N);
201
      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
202
      r1 = last;
203
    }
204
#endif
205
206
    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
207
0
    const V sum1 = Add(l1, r1);
208
0
    const V mul1 = MulAdd(sum1, wh1, mul0);
209
0
    const V sum2 = Add(l2, r2);
210
0
    const V mul2 = MulAdd(sum2, wh2, mul1);
211
0
    return mul2;
212
0
  }
Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>)
Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>)
Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>)
Unexecuted instantiation: hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::Separable5Strategy::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>)
213
214
  // Requires kRadius valid pixels before/after pos.
215
  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
216
                                         const V wh0, const V wh1,
217
0
                                         const V wh2) {
218
0
    const D d;
219
0
    const V c = LoadU(d, pos);
220
0
    const V mul0 = Mul(c, wh0);
221
222
    // Loading anew is faster than combining vectors.
223
0
    const V l1 = LoadU(d, pos - 1);
224
0
    const V r1 = LoadU(d, pos + 1);
225
0
    const V l2 = LoadU(d, pos - 2);
226
0
    const V r2 = LoadU(d, pos + 2);
227
    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
228
0
    const V sum1 = Add(l1, r1);
229
0
    const V mul1 = MulAdd(sum1, wh1, mul0);
230
0
    const V sum2 = Add(l2, r2);
231
0
    const V mul2 = MulAdd(sum2, wh2, mul1);
232
0
    return mul2;
233
0
  }
234
};
235
236
Status Separable5(const ImageF& in, const Rect& rect,
237
                  const WeightsSeparable5& weights, ThreadPool* pool,
238
0
                  ImageF* out) {
239
0
  using Conv = ConvolveT<Separable5Strategy>;
240
0
  if (rect.xsize() >= Conv::MinWidth()) {
241
0
    JXL_ENSURE(SameSize(rect, *out));
242
0
    JXL_ENSURE(rect.xsize() >= Conv::MinWidth());
243
244
0
    Conv::Run(in, rect, weights, pool, out);
245
0
    return true;
246
0
  }
247
248
0
  return SlowSeparable5(in, rect, weights, pool, out, Rect(*out));
249
0
}
250
251
// NOLINTNEXTLINE(google-readability-namespace-comments)
252
}  // namespace HWY_NAMESPACE
253
}  // namespace jxl
254
HWY_AFTER_NAMESPACE();
255
256
#if HWY_ONCE
257
namespace jxl {
258
259
HWY_EXPORT(Separable5);
260
Status Separable5(const ImageF& in, const Rect& rect,
261
                  const WeightsSeparable5& weights, ThreadPool* pool,
262
0
                  ImageF* out) {
263
0
  return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
264
0
}
265
266
}  // namespace jxl
267
#endif  // HWY_ONCE