Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/lib/jxl/enc_convolve_separable5.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include <cstddef>
7
#include <cstdint>
8
#include <cstdlib>
9
10
#include "lib/jxl/base/compiler_specific.h"
11
#include "lib/jxl/base/data_parallel.h"
12
#include "lib/jxl/base/status.h"
13
#include "lib/jxl/convolve.h"
14
#include "lib/jxl/image.h"
15
#include "lib/jxl/image_ops.h"
16
17
#undef HWY_TARGET_INCLUDE
18
#define HWY_TARGET_INCLUDE "lib/jxl/enc_convolve_separable5.cc"
19
#include <hwy/foreach_target.h>
20
#include <hwy/highway.h>
21
22
#include "lib/jxl/base/rect.h"
23
#include "lib/jxl/convolve-inl.h"
24
25
HWY_BEFORE_NAMESPACE();
26
namespace jxl {
27
namespace HWY_NAMESPACE {
28
29
// These templates are not found via ADL.
30
using hwy::HWY_NAMESPACE::Add;
31
using hwy::HWY_NAMESPACE::IndicesFromVec;
32
using hwy::HWY_NAMESPACE::Iota;
33
using hwy::HWY_NAMESPACE::Max;
34
using hwy::HWY_NAMESPACE::Min;
35
using hwy::HWY_NAMESPACE::Mul;
36
using hwy::HWY_NAMESPACE::MulAdd;
37
using hwy::HWY_NAMESPACE::Sub;
38
using hwy::HWY_NAMESPACE::Vec;
39
40
using D = HWY_CAPPED(float, 16);
41
using DI32 = HWY_CAPPED(int32_t, 16);
42
using V = Vec<D>;
43
using VI32 = Vec<DI32>;
44
using I = decltype(SetTableIndices(D(), static_cast<int32_t*>(nullptr)));
45
46
// 5x5 convolution by separable kernel with a single scan through the input.
47
// This is more cache-efficient than separate horizontal/vertical passes, and
48
// possibly faster (given enough registers) than tiling and/or transposing.
49
//
50
// Overview: imagine a 5x5 window around a central pixel. First convolve the
51
// rows by multiplying the pixels with the corresponding weights from
52
// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
53
// intermediate results by the corresponding vertical weight, i.e.
54
// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
55
// convolution result at the position of the central pixel in the output.
56
//
57
// Each of these operations uses SIMD vectors. The central pixel and most
58
// importantly the output are aligned, so neighboring pixels (e.g. x_offset=1)
59
// require unaligned loads. Because weights are supplied in identical groups of
60
// 4, we can use LoadDup128 to load them (slightly faster).
61
//
62
// Uses mirrored boundary handling. Until x >= kRadius, the horizontal
63
// convolution uses Neighbors class to shuffle vectors as if each of its lanes
64
// had been loaded from the mirrored offset. Similarly, the last full vector to
65
// write uses mirroring. In the case of scalar vectors, Neighbors is not usable
66
// and the value is loaded directly. Otherwise, the number of valid pixels
67
// modulo the vector size enables a small optimization: for smaller offsets,
68
// a non-mirrored load is sufficient.
69
class Separable5Impl {
70
 public:
71
  using Simd = HWY_CAPPED(float, 16);
72
  static constexpr int64_t kRadius = 2;
73
74
  // Stores the image, weights, pool and output pointers plus a copy of `rect`.
  // No pixels are read or written until Run() is called.
  Separable5Impl(const ImageF* in, const Rect& rect,
75
                 const WeightsSeparable5* weights, ThreadPool* pool,
76
                 ImageF* out)
77
0
      : in(in), rect(rect), weights(weights), pool(pool), out(out) {}
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::Separable5Impl(jxl::Plane<float> const*, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const*, jxl::ThreadPool*, jxl::Plane<float>*)
78
79
0
  // Convolves `rect` of `in` into `out`.
  // If the rect is at least `min_width` pixels wide, the vectorized path is
  // taken: RunRows<k> is instantiated for k = xsize modulo the vector lane
  // count, with all remainders >= 3 sharing RunRows<3>. Narrower rects are
  // forwarded to SlowSeparable5.
  // Returns true after the vectorized path, otherwise SlowSeparable5's result.
  Status Run() {
80
#if HWY_TARGET == HWY_SCALAR
81
    // First/Last use mirrored loads of up to +/- kRadius.
82
    size_t min_width = 2 * kRadius;
83
#else
84
0
    size_t min_width = Lanes(Simd()) + kRadius;
85
0
#endif
86
87
0
    if (rect.xsize() >= min_width) {
88
0
      // `out` is indexed with rect-relative coordinates, so it must have the
      // same dimensions as `rect`.
      JXL_ENSURE(SameSize(rect, *out));
89
90
0
      switch (rect.xsize() % Lanes(Simd())) {
91
0
        case 0:
92
0
          RunRows<0>();
93
0
          break;
94
0
        case 1:
95
0
          RunRows<1>();
96
0
          break;
97
0
        case 2:
98
0
          RunRows<2>();
99
0
          break;
100
0
        default:
101
0
          RunRows<3>();
102
0
          break;
103
0
      }
104
0
      return true;
105
0
    } else {
106
0
      return SlowSeparable5(*in, rect, *weights, pool, out, Rect(*out));
107
0
    }
108
0
  }
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::Run()
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::Run()
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::Run()
109
110
  // Computes one output row. `y` is relative to `rect`: the input row is
  // rect.ConstRow(*in, y) and the result is written to out->Row(y).
  // kSizeModN is the remainder class chosen in Run() (0, 1, 2, or 3 meaning
  // "3 or more"). kBorder selects whether the four vertical neighbor rows
  // must be redirected to mirrored rows at the top/bottom of the image.
  template <size_t kSizeModN, bool kBorder>
111
0
  JXL_NOINLINE void ConvolveRow(const uint32_t y) {
112
0
    const D d;
113
0
    const int64_t stride = in->PixelsPerRow();
114
0
    const int64_t neg_stride = -stride;  // allows LEA addressing.
115
0
    const size_t xsize = rect.xsize();
116
0
    const float* const JXL_RESTRICT row_m = rect.ConstRow(*in, y);
117
0
    float* const JXL_RESTRICT row_out = out->Row(y);
118
0
    // Default neighbor rows: two above (t2, t1) and two below (b1, b2) the
    // middle row, addressed by stride. Border handling may overwrite them.
    const float* JXL_RESTRICT row_t2 = row_m + 2 * neg_stride;
119
0
    const float* JXL_RESTRICT row_t1 = row_m + 1 * neg_stride;
120
0
    const float* JXL_RESTRICT row_b1 = row_m + 1 * stride;
121
0
    const float* JXL_RESTRICT row_b2 = row_m + 2 * stride;
122
123
0
    if (kBorder) {
124
0
      // Borders are determined by the image row index, not the rect-relative
      // one.
      size_t img_y = rect.y0() + y;
125
0
      if (in->ysize() <= 2 * kRadius) {  // Very special: double reflections
126
0
        // One group of 8 entries per image height (1..4 rows). For image row
        // img_y, index `o` below points at the row itself and o-2..o+2 give
        // the source rows of the five vertical taps. 0xBAD marks entries that
        // are never indexed for that height.
        static constexpr size_t kBorderLut[4 * 8] = {
127
0
            0, 0, 0, 0, 0, 0xBAD, 0xBAD, 0xBAD,  // 1 row
128
0
            1, 0, 0, 1, 1, 0,     0xBAD, 0xBAD,  // 2 rows
129
0
            1, 0, 0, 1, 2, 2,     1,     0xBAD,  // 3 rows
130
0
            1, 0, 0, 1, 2, 3,     3,     2,      // 4 rows
131
0
        };
132
0
        JXL_DASSERT(in->ysize() <= 4);
133
0
        size_t o = in->ysize() * 8 - 6 + img_y;
134
0
        row_t2 = in->ConstRow(kBorderLut[o - 2]) + rect.x0();
135
0
        row_t1 = in->ConstRow(kBorderLut[o - 1]) + rect.x0();
136
0
        row_b1 = in->ConstRow(kBorderLut[o + 1]) + rect.x0();
137
0
        row_b2 = in->ConstRow(kBorderLut[o + 2]) + rect.x0();
138
0
      } else if (img_y < kRadius) {
139
0
        // Top border: rows above the image are replaced by their mirror
        // images (row -1 -> row 0, row -2 -> row 1).
        if (img_y == 0) {
140
0
          row_t1 = row_m;
141
0
          row_t2 = row_b1;
142
0
        } else {
143
0
          JXL_DASSERT(img_y == 1);
144
0
          row_t2 = row_t1;
145
0
        }
146
0
      } else {
147
0
        // Bottom border: same mirroring for rows below the last image row.
        JXL_DASSERT(img_y + kRadius >= in->ysize());
148
0
        if (img_y + 1 == in->ysize()) {
149
0
          row_b1 = row_m;
150
0
          row_b2 = row_t1;
151
0
        } else {
152
0
          JXL_DASSERT(img_y + 2 == in->ysize());
153
0
          row_b2 = row_b1;
154
0
        }
155
0
      }
156
0
    }
157
158
0
    // Horizontal (wh*) and vertical (wv*) weights for tap distances 0, 1, 2;
    // each distance occupies a group of 4 floats in the weight arrays.
    const V wh0 = LoadDup128(d, weights->horz + 0 * 4);
159
0
    const V wh1 = LoadDup128(d, weights->horz + 1 * 4);
160
0
    const V wh2 = LoadDup128(d, weights->horz + 2 * 4);
161
0
    const V wv0 = LoadDup128(d, weights->vert + 0 * 4);
162
0
    const V wv1 = LoadDup128(d, weights->vert + 1 * 4);
163
0
    const V wv2 = LoadDup128(d, weights->vert + 2 * 4);
164
0
    // Lane permutations used by HorzConvolveLast to mirror at the right edge.
    const I ml1 = MirrorLanes<1>();
165
0
    const I ml2 = MirrorLanes<2>();
166
167
0
    size_t x = 0;
168
169
    // More than one iteration for scalars.
170
0
    for (; x < kRadius; x += Lanes(d)) {
171
0
      const V conv0 =
172
0
          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
173
174
0
      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
175
0
      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
176
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
177
178
0
      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
179
0
      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
180
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
181
0
      Store(conv2, d, row_out + x);
182
0
    }
183
184
    // Main loop: load inputs without padding
185
0
    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
186
0
      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0);
187
188
0
      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
189
0
      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
190
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
191
192
0
      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
193
0
      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
194
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
195
0
      Store(conv2, d, row_out + x);
196
0
    }
197
198
    // Last full vector to write (the above loop handled mod >= kRadius)
199
#if HWY_TARGET == HWY_SCALAR
200
    while (x < xsize) {
201
#else
202
0
    if (kSizeModN < kRadius) {
203
0
#endif
204
0
      const V conv0 = Mul(
205
0
          HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, ml1, ml2),
206
0
          wv0);
207
208
0
      const V conv1t = HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1,
209
0
                                                   wh2, ml1, ml2);
210
0
      const V conv1b = HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1,
211
0
                                                   wh2, ml1, ml2);
212
0
      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
213
214
0
      const V conv2t = HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1,
215
0
                                                   wh2, ml1, ml2);
216
0
      const V conv2b = HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1,
217
0
                                                   wh2, ml1, ml2);
218
0
      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
219
0
      Store(conv2, d, row_out + x);
220
0
      x += Lanes(d);
221
0
    }
222
223
    // If mod = 0, the above vector was the last.
224
0
    if (kSizeModN != 0) {
225
0
      // Remaining pixels are computed one at a time as a full 5x5 weighted
      // sum; x is mirrored via Mirror(), rows reuse the (possibly mirrored)
      // row pointers chosen above.
      const float* JXL_RESTRICT rows[5] = {row_t2, row_t1, row_m, row_b1,
226
0
                                           row_b2};
227
0
      for (; x < xsize; ++x) {
228
0
        float mul = 0.0f;
229
0
        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
230
0
          const float wy = weights->vert[std::abs(dy) * 4];
231
0
          const float* clamped_row = rows[dy + 2];
232
0
          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
233
0
            const float wx = weights->horz[std::abs(dx) * 4];
234
0
            const int64_t clamped_x = Mirror(x + dx, xsize);
235
0
            mul += clamped_row[clamped_x] * wx * wy;
236
0
          }
237
0
        }
238
0
        row_out[x] = mul;
239
0
      }
240
0
    }
241
0
  }
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<0ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<0ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<1ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<1ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<2ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<2ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<3ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::ConvolveRow<3ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<0ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<0ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<1ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<1ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<2ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<2ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<3ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::ConvolveRow<3ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<0ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<0ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<1ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<1ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<2ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<2ul, false>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<3ul, true>(unsigned int)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::ConvolveRow<3ul, false>(unsigned int)
242
243
 private:
244
  // Splits the rect's rows into up to three ranges and processes each:
  // top border rows (image y < kRadius), interior rows, and bottom border
  // rows (image y + kRadius >= image height). Border rows go through
  // RunBorderRows, interior rows through RunInteriorRows. The ranges passed on
  // are relative to rect.y0().
  template <size_t kSizeModN>
245
0
  JXL_INLINE void RunRows() {
246
    // NB: borders are image-bound, not rect-bound.
247
0
    size_t ybegin = rect.y0();
248
0
    size_t yend = rect.y1();
249
0
    // Advance ybegin past rows that lie within kRadius of the image top.
    while (ybegin < yend && ybegin < kRadius) {
250
0
      ybegin++;
251
0
    }
252
0
    // Pull yend back over rows that lie within kRadius of the image bottom.
    while (ybegin < yend && yend + kRadius > in->ysize()) {
253
0
      yend--;
254
0
    }
255
0
    if (ybegin > rect.y0()) {
256
0
      RunBorderRows<kSizeModN>(0, ybegin - rect.y0());
257
0
    }
258
0
    if (yend > ybegin) {
259
0
      RunInteriorRows<kSizeModN>(ybegin - rect.y0(), yend - rect.y0());
260
0
    }
261
0
    if (yend < rect.y1()) {
262
0
      RunBorderRows<kSizeModN>(yend - rect.y0(), rect.ysize());
263
0
    }
264
0
  }
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<0ul>()
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<1ul>()
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<2ul>()
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunRows<3ul>()
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<0ul>()
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<1ul>()
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<2ul>()
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunRows<3ul>()
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<0ul>()
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<1ul>()
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<2ul>()
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunRows<3ul>()
265
266
  // Convolves rect-relative rows [ybegin, yend) one after another in a plain
  // loop, using the kBorder=true variant of ConvolveRow.
  template <size_t kSizeModN>
267
0
  JXL_INLINE void RunBorderRows(const size_t ybegin, const size_t yend) {
268
0
    for (size_t y = ybegin; y < yend; ++y) {
269
0
      ConvolveRow<kSizeModN, true>(y);
270
0
    }
271
0
  }
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunBorderRows<3ul>(unsigned long, unsigned long)
272
273
  // Convolves rect-relative rows [ybegin, yend) with the kBorder=false variant
  // of ConvolveRow, handing the per-row callback to RunOnPool with `pool`.
  // NOTE(review): the Status returned by RunOnPool is only checked through
  // JXL_DASSERT and then discarded, so a pool failure is not reported to the
  // caller of Run() - confirm this is intended.
  template <size_t kSizeModN>
274
0
  JXL_INLINE void RunInteriorRows(const size_t ybegin, const size_t yend) {
275
0
    const auto process_row = [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
276
0
      ConvolveRow<kSizeModN, false>(y);
277
0
      return true;
278
0
    };
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)::{lambda(unsigned int, unsigned long)#1}::operator()(unsigned int, unsigned long) const
279
0
    Status status = RunOnPool(pool, ybegin, yend, ThreadPool::NoInit,
280
0
                              process_row, "Convolve");
281
0
    JXL_DASSERT(status);
282
0
    (void)status;
283
0
  }
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_AVX2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<0ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<1ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<2ul>(unsigned long, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::Separable5Impl::RunInteriorRows<3ul>(unsigned long, unsigned long)
284
285
  // Returns IndicesFromVec(d, indices) such that TableLookupLanes on the
286
  // rightmost unaligned vector (rightmost sample in its most-significant lane)
287
  // returns the mirrored values, with the mirror outside the last valid sample.
288
  // Builds the lane-index table for a right shift by M lanes with mirroring.
  // With N = Lanes(d) and assuming Iota(d, k) yields k + lane index, lane i
  // receives min(i + M, N - 1) - max(i + M - N, 0): i + M while that stays
  // inside the vector, otherwise the index reflected back from the last lane.
  template <size_t M>
289
0
  static JXL_INLINE I MirrorLanes() {
290
0
    static_assert(M >= 1 && M <= 2, "Only M in range {1..2} is supported");
291
0
    D d;
292
0
    DI32 di32;
293
0
    // Shifted index, clamped to the last lane.
    const VI32 up = Min(Iota(di32, M), Set(di32, Lanes(d) - 1));
294
0
    // Amount by which the shifted index overshoots the last lane (0 if none).
    const VI32 down = Max(Iota(di32, M - Lanes(d)), Zero(di32));
295
0
    return IndicesFromVec(d, Sub(up, down));
296
0
  }
Unexecuted instantiation: hwy::N_SSE4::Indices128<float, 4ul> jxl::N_SSE4::Separable5Impl::MirrorLanes<1ul>()
Unexecuted instantiation: hwy::N_SSE4::Indices128<float, 4ul> jxl::N_SSE4::Separable5Impl::MirrorLanes<2ul>()
Unexecuted instantiation: hwy::N_AVX2::Indices256<float> jxl::N_AVX2::Separable5Impl::MirrorLanes<1ul>()
Unexecuted instantiation: hwy::N_AVX2::Indices256<float> jxl::N_AVX2::Separable5Impl::MirrorLanes<2ul>()
Unexecuted instantiation: hwy::N_SSE2::Indices128<float, 4ul> jxl::N_SSE2::Separable5Impl::MirrorLanes<1ul>()
Unexecuted instantiation: hwy::N_SSE2::Indices128<float, 4ul> jxl::N_SSE2::Separable5Impl::MirrorLanes<2ul>()
297
298
  // Same as HorzConvolve for the first/last vector in a row.
299
  // Horizontal 5-tap weighted sum for the vector starting at `row + x` at the
  // left edge of a row. Left neighbors (l1, l2) are obtained without reading
  // before the row start: via Mirror()-ed loads on scalar targets, otherwise
  // from the center vector through Neighbors::FirstL1/FirstL2. Right neighbors
  // are plain unaligned loads. Returns c*wh0 + (l1+r1)*wh1 + (l2+r2)*wh2.
  static JXL_MAYBE_INLINE V HorzConvolveFirst(
300
      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
301
0
      const V wh0, const V wh1, const V wh2) {
302
0
    const D d;
303
0
    const V c = LoadU(d, row + x);
304
0
    const V mul0 = Mul(c, wh0);
305
306
#if HWY_TARGET == HWY_SCALAR
307
    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
308
    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
309
#else
310
0
    // xsize is only needed by the scalar branch above.
    (void)xsize;
311
0
    const V l1 = Neighbors::FirstL1(c);
312
0
    const V l2 = Neighbors::FirstL2(c);
313
0
#endif
314
315
0
    const V r1 = LoadU(d, row + x + 1);
316
0
    const V r2 = LoadU(d, row + x + 2);
317
318
0
    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
319
0
    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
320
0
    return mul2;
321
0
  }
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::HorzConvolveFirst(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
322
323
  // Horizontal 5-tap weighted sum for the last full vector of a row, starting
  // at `row + x`. Left neighbors are plain unaligned loads; right neighbors
  // (r1, r2) are produced without reading past `xsize`:
  //  - scalar targets: Mirror()-ed loads;
  //  - kSizeModN == 0: both are lane permutations (ml1, ml2) of the center
  //    vector;
  //  - otherwise: r1 is the vector ending at the last valid pixel and r2 is
  //    that vector permuted by ml1.
  // Returns c*wh0 + (l1+r1)*wh1 + (l2+r2)*wh2.
  template <size_t kSizeModN>
324
  static JXL_MAYBE_INLINE V HorzConvolveLast(
325
      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
326
0
      const V wh0, const V wh1, const V wh2, const I ml1, const I ml2) {
327
0
    const D d;
328
0
    const V c = LoadU(d, row + x);
329
0
    const V mul0 = Mul(c, wh0);
330
331
0
    const V l1 = LoadU(d, row + x - 1);
332
0
    const V l2 = LoadU(d, row + x - 2);
333
334
0
    V r1;
335
0
    V r2;
336
#if HWY_TARGET == HWY_SCALAR
337
    r1 = LoadU(d, row + Mirror(x + 1, xsize));
338
    r2 = LoadU(d, row + Mirror(x + 2, xsize));
339
    (void)ml1;
340
    (void)ml2;
341
#else
342
0
    const size_t N = Lanes(d);
343
0
    if (kSizeModN == 0) {
344
0
      r2 = TableLookupLanes(c, ml2);
345
0
      r1 = TableLookupLanes(c, ml1);
346
0
    } else {  // == 1
347
0
      // The vector ending at the last valid pixel; used directly as r1.
      const auto last = LoadU(d, row + xsize - N);
348
0
      r2 = TableLookupLanes(last, ml1);
349
0
      r1 = last;
350
0
    }
351
0
#endif
352
353
    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
354
0
    const V sum1 = Add(l1, r1);
355
0
    const V mul1 = MulAdd(sum1, wh1, mul0);
356
0
    const V sum2 = Add(l2, r2);
357
0
    const V mul2 = MulAdd(sum2, wh2, mul1);
358
0
    return mul2;
359
0
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<0ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<1ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>, hwy::N_SSE4::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Indices256<float>, hwy::N_AVX2::Indices256<float>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<2ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::Separable5Impl::HorzConvolveLast<3ul>(float const*, long, long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>, hwy::N_SSE2::Indices128<float, 4ul>)
360
361
  // Requires kRadius valid pixels before/after pos.
362
  // Horizontal 5-tap weighted sum for the vector starting at `pos`, using
  // unaligned loads at offsets -2..+2 with no boundary handling.
  // Returns c*wh0 + (l1+r1)*wh1 + (l2+r2)*wh2.
  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
363
                                         const V wh0, const V wh1,
364
0
                                         const V wh2) {
365
0
    const D d;
366
0
    const V c = LoadU(d, pos);
367
0
    const V mul0 = Mul(c, wh0);
368
369
    // Loading anew is faster than combining vectors.
370
0
    const V l1 = LoadU(d, pos - 1);
371
0
    const V r1 = LoadU(d, pos + 1);
372
0
    const V l2 = LoadU(d, pos - 2);
373
0
    const V r2 = LoadU(d, pos + 2);
374
    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
375
0
    const V sum1 = Add(l1, r1);
376
0
    const V mul1 = MulAdd(sum1, wh1, mul0);
377
0
    const V sum2 = Add(l2, r2);
378
0
    const V mul2 = MulAdd(sum2, wh2, mul1);
379
0
    return mul2;
380
0
  }
Unexecuted instantiation: jxl::N_SSE4::Separable5Impl::HorzConvolve(float const*, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: jxl::N_AVX2::Separable5Impl::HorzConvolve(float const*, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: jxl::N_SSE2::Separable5Impl::HorzConvolve(float const*, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
381
382
  const ImageF* in;
383
  const Rect rect;
384
  const WeightsSeparable5* weights;
385
  ThreadPool* pool;
386
  ImageF* out;
387
};
388
389
// Per-target entry point (compiled once per HWY_NAMESPACE): wraps the
// arguments in a Separable5Impl and returns the result of its Run().
Status Separable5(const ImageF& in, const Rect& rect,
390
                  const WeightsSeparable5& weights, ThreadPool* pool,
391
0
                  ImageF* out) {
392
0
  Separable5Impl impl(&in, rect, &weights, pool, out);
393
0
  return impl.Run();
394
0
}
Unexecuted instantiation: jxl::N_SSE4::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_SSE2::Separable5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSeparable5 const&, jxl::ThreadPool*, jxl::Plane<float>*)
395
396
// NOLINTNEXTLINE(google-readability-namespace-comments)
397
}  // namespace HWY_NAMESPACE
398
}  // namespace jxl
399
HWY_AFTER_NAMESPACE();
400
401
#if HWY_ONCE
402
namespace jxl {
403
404
HWY_EXPORT(Separable5);
405
// Entry point in namespace jxl: forwards all arguments unchanged to the
// per-target Separable5 selected by HWY_DYNAMIC_DISPATCH and returns its
// Status.
Status Separable5(const ImageF& in, const Rect& rect,
406
                  const WeightsSeparable5& weights, ThreadPool* pool,
407
0
                  ImageF* out) {
408
0
  return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
409
0
}
410
411
}  // namespace jxl
412
#endif  // HWY_ONCE