Coverage Report

Created: 2025-07-23 08:18

/src/libjxl/lib/jxl/convolve_symmetric5.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include <algorithm>
7
#include <cstddef>
8
#include <cstdint>
9
#include <cstdio>
10
11
#include "lib/jxl/base/compiler_specific.h"
12
#include "lib/jxl/base/data_parallel.h"
13
#include "lib/jxl/base/status.h"
14
#include "lib/jxl/convolve.h"
15
#include "lib/jxl/image.h"
16
17
#undef HWY_TARGET_INCLUDE
18
#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
19
#include <hwy/foreach_target.h>
20
#include <hwy/highway.h>
21
22
#include "lib/jxl/base/common.h"
23
#include "lib/jxl/base/rect.h"
24
#include "lib/jxl/image_ops.h"
25
26
HWY_BEFORE_NAMESPACE();
27
namespace jxl {
28
namespace HWY_NAMESPACE {
29
30
// These templates are not found via ADL.
31
using hwy::HWY_NAMESPACE::Add;
32
using hwy::HWY_NAMESPACE::Mul;
33
using hwy::HWY_NAMESPACE::Vec;
34
35
// Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
36
template <class WrapY>
37
static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
38
                               const int64_t ix, const int64_t iy,
39
                               const size_t xsize, const size_t ysize,
40
                               const float wx0, const float wx1,
41
159M
                               const float wx2) {
42
159M
  const WrapMirror wrap_x;
43
159M
  const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
44
159M
  const float in_m2 = row[wrap_x(ix - 2, xsize)];
45
159M
  const float in_p2 = row[wrap_x(ix + 2, xsize)];
46
159M
  const float in_m1 = row[wrap_x(ix - 1, xsize)];
47
159M
  const float in_p1 = row[wrap_x(ix + 1, xsize)];
48
159M
  const float in_00 = row[ix];
49
159M
  const float sum_2 = wx2 * (in_m2 + in_p2);
50
159M
  const float sum_1 = wx1 * (in_m1 + in_p1);
51
159M
  const float sum_0 = wx0 * in_00;
52
159M
  return sum_2 + (sum_1 + sum_0);
53
159M
}
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Line
Count
Source
41
2.62M
                               const float wx2) {
42
2.62M
  const WrapMirror wrap_x;
43
2.62M
  const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
44
2.62M
  const float in_m2 = row[wrap_x(ix - 2, xsize)];
45
2.62M
  const float in_p2 = row[wrap_x(ix + 2, xsize)];
46
2.62M
  const float in_m1 = row[wrap_x(ix - 1, xsize)];
47
2.62M
  const float in_p1 = row[wrap_x(ix + 1, xsize)];
48
2.62M
  const float in_00 = row[ix];
49
2.62M
  const float sum_2 = wx2 * (in_m2 + in_p2);
50
2.62M
  const float sum_1 = wx1 * (in_m1 + in_p1);
51
2.62M
  const float sum_0 = wx0 * in_00;
52
2.62M
  return sum_2 + (sum_1 + sum_0);
53
2.62M
}
convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
Line
Count
Source
41
156M
                               const float wx2) {
42
156M
  const WrapMirror wrap_x;
43
156M
  const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
44
156M
  const float in_m2 = row[wrap_x(ix - 2, xsize)];
45
156M
  const float in_p2 = row[wrap_x(ix + 2, xsize)];
46
156M
  const float in_m1 = row[wrap_x(ix - 1, xsize)];
47
156M
  const float in_p1 = row[wrap_x(ix + 1, xsize)];
48
156M
  const float in_00 = row[ix];
49
156M
  const float sum_2 = wx2 * (in_m2 + in_p2);
50
156M
  const float sum_1 = wx1 * (in_m1 + in_p1);
51
156M
  const float sum_0 = wx0 * in_00;
52
156M
  return sum_2 + (sum_1 + sum_0);
53
156M
}
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3_ZEN4::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3_ZEN4::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3_SPR::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX3_SPR::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float)
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float)
54
55
template <class WrapY, class V>
56
static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
57
                     const int64_t iy, const size_t ysize, const V wx0,
58
444M
                     const V wx1, const V wx2) {
59
444M
  const HWY_FULL(float) d;
60
444M
  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
61
444M
  const auto in_m2 = LoadU(d, center - 2);
62
444M
  const auto in_p2 = LoadU(d, center + 2);
63
444M
  const auto in_m1 = LoadU(d, center - 1);
64
444M
  const auto in_p1 = LoadU(d, center + 1);
65
444M
  const auto in_00 = LoadU(d, center);
66
444M
  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
67
444M
  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
68
444M
  const auto sum_0 = Mul(wx0, in_00);
69
444M
  return Add(sum_2, Add(sum_1, sum_0));
70
444M
}
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapMirror, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapMirror, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
58
5.13M
                     const V wx1, const V wx2) {
59
5.13M
  const HWY_FULL(float) d;
60
5.13M
  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
61
5.13M
  const auto in_m2 = LoadU(d, center - 2);
62
5.13M
  const auto in_p2 = LoadU(d, center + 2);
63
5.13M
  const auto in_m1 = LoadU(d, center - 1);
64
5.13M
  const auto in_p1 = LoadU(d, center + 1);
65
5.13M
  const auto in_00 = LoadU(d, center);
66
5.13M
  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
67
5.13M
  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
68
5.13M
  const auto sum_0 = Mul(wx0, in_00);
69
5.13M
  return Add(sum_2, Add(sum_1, sum_0));
70
5.13M
}
convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
58
438M
                     const V wx1, const V wx2) {
59
438M
  const HWY_FULL(float) d;
60
438M
  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
61
438M
  const auto in_m2 = LoadU(d, center - 2);
62
438M
  const auto in_p2 = LoadU(d, center + 2);
63
438M
  const auto in_m1 = LoadU(d, center - 1);
64
438M
  const auto in_p1 = LoadU(d, center + 1);
65
438M
  const auto in_00 = LoadU(d, center);
66
438M
  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
67
438M
  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
68
438M
  const auto sum_0 = Mul(wx0, in_00);
69
438M
  return Add(sum_2, Add(sum_1, sum_0));
70
438M
}
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::WeightedSum<jxl::WrapMirror, hwy::N_AVX3::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX3::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::WeightedSum<jxl::WrapMirror, hwy::N_AVX3_ZEN4::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX3_ZEN4::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::WeightedSum<jxl::WrapMirror, hwy::N_AVX3_SPR::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX3_SPR::Vec512<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapMirror, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
71
72
// Produces result for one pixel
73
template <class WrapY>
74
float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy,
75
31.8M
                       const WeightsSymmetric5& weights) {
76
31.8M
  const float w0 = weights.c[0];
77
31.8M
  const float w1 = weights.r[0];
78
31.8M
  const float w2 = weights.R[0];
79
31.8M
  const float w4 = weights.d[0];
80
31.8M
  const float w5 = weights.L[0];
81
31.8M
  const float w8 = weights.D[0];
82
83
31.8M
  const size_t xsize = in.xsize();
84
31.8M
  const size_t ysize = in.ysize();
85
31.8M
  const WrapY wrap_y;
86
  // Unrolled loop over all 5 rows of the kernel.
87
31.8M
  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
88
89
31.8M
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
90
31.8M
  float sum1 =
91
31.8M
      WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
92
93
31.8M
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
94
31.8M
  sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
95
96
31.8M
  return sum0 + sum1;
97
31.8M
}
Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
float jxl::N_AVX2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Line
Count
Source
75
525k
                       const WeightsSymmetric5& weights) {
76
525k
  const float w0 = weights.c[0];
77
525k
  const float w1 = weights.r[0];
78
525k
  const float w2 = weights.R[0];
79
525k
  const float w4 = weights.d[0];
80
525k
  const float w5 = weights.L[0];
81
525k
  const float w8 = weights.D[0];
82
83
525k
  const size_t xsize = in.xsize();
84
525k
  const size_t ysize = in.ysize();
85
525k
  const WrapY wrap_y;
86
  // Unrolled loop over all 5 rows of the kernel.
87
525k
  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
88
89
525k
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
90
525k
  float sum1 =
91
525k
      WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
92
93
525k
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
94
525k
  sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
95
96
525k
  return sum0 + sum1;
97
525k
}
float jxl::N_AVX2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Line
Count
Source
75
31.3M
                       const WeightsSymmetric5& weights) {
76
31.3M
  const float w0 = weights.c[0];
77
31.3M
  const float w1 = weights.r[0];
78
31.3M
  const float w2 = weights.R[0];
79
31.3M
  const float w4 = weights.d[0];
80
31.3M
  const float w5 = weights.L[0];
81
31.3M
  const float w8 = weights.D[0];
82
83
31.3M
  const size_t xsize = in.xsize();
84
31.3M
  const size_t ysize = in.ysize();
85
31.3M
  const WrapY wrap_y;
86
  // Unrolled loop over all 5 rows of the kernel.
87
31.3M
  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
88
89
31.3M
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
90
31.3M
  float sum1 =
91
31.3M
      WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
92
93
31.3M
  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
94
31.3M
  sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
95
96
31.3M
  return sum0 + sum1;
97
31.3M
}
Unexecuted instantiation: float jxl::N_AVX3::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_AVX3::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_AVX3_ZEN4::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_AVX3_ZEN4::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_AVX3_SPR::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_AVX3_SPR::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&)
98
99
// Produces result for one vector's worth of pixels
100
template <class WrapY>
101
static void Symmetric5Interior(const ImageF& in, const int64_t ix,
102
                               const int64_t rix, const int64_t iy,
103
                               const WeightsSymmetric5& weights,
104
88.8M
                               float* JXL_RESTRICT row_out) {
105
88.8M
  const HWY_FULL(float) d;
106
107
88.8M
  const auto w0 = LoadDup128(d, weights.c);
108
88.8M
  const auto w1 = LoadDup128(d, weights.r);
109
88.8M
  const auto w2 = LoadDup128(d, weights.R);
110
88.8M
  const auto w4 = LoadDup128(d, weights.d);
111
88.8M
  const auto w5 = LoadDup128(d, weights.L);
112
88.8M
  const auto w8 = LoadDup128(d, weights.D);
113
114
88.8M
  const size_t ysize = in.ysize();
115
88.8M
  const WrapY wrap_y;
116
  // Unrolled loop over all 5 rows of the kernel.
117
88.8M
  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
118
119
88.8M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
120
88.8M
  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
121
122
88.8M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
123
88.8M
  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
124
125
88.8M
  StoreU(Add(sum0, sum1), d, row_out + rix);
126
88.8M
}
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Line
Count
Source
104
1.02M
                               float* JXL_RESTRICT row_out) {
105
1.02M
  const HWY_FULL(float) d;
106
107
1.02M
  const auto w0 = LoadDup128(d, weights.c);
108
1.02M
  const auto w1 = LoadDup128(d, weights.r);
109
1.02M
  const auto w2 = LoadDup128(d, weights.R);
110
1.02M
  const auto w4 = LoadDup128(d, weights.d);
111
1.02M
  const auto w5 = LoadDup128(d, weights.L);
112
1.02M
  const auto w8 = LoadDup128(d, weights.D);
113
114
1.02M
  const size_t ysize = in.ysize();
115
1.02M
  const WrapY wrap_y;
116
  // Unrolled loop over all 5 rows of the kernel.
117
1.02M
  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
118
119
1.02M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
120
1.02M
  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
121
122
1.02M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
123
1.02M
  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
124
125
1.02M
  StoreU(Add(sum0, sum1), d, row_out + rix);
126
1.02M
}
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Line
Count
Source
104
87.7M
                               float* JXL_RESTRICT row_out) {
105
87.7M
  const HWY_FULL(float) d;
106
107
87.7M
  const auto w0 = LoadDup128(d, weights.c);
108
87.7M
  const auto w1 = LoadDup128(d, weights.r);
109
87.7M
  const auto w2 = LoadDup128(d, weights.R);
110
87.7M
  const auto w4 = LoadDup128(d, weights.d);
111
87.7M
  const auto w5 = LoadDup128(d, weights.L);
112
87.7M
  const auto w8 = LoadDup128(d, weights.D);
113
114
87.7M
  const size_t ysize = in.ysize();
115
87.7M
  const WrapY wrap_y;
116
  // Unrolled loop over all 5 rows of the kernel.
117
87.7M
  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
118
119
87.7M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
120
87.7M
  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
121
122
87.7M
  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
123
87.7M
  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
124
125
87.7M
  StoreU(Add(sum0, sum1), d, row_out + rix);
126
87.7M
}
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_ZEN4::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_ZEN4::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_SPR::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_SPR::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*)
127
128
template <class WrapY>
129
static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
130
                          const WeightsSymmetric5& weights,
131
2.03M
                          float* JXL_RESTRICT row_out) {
132
2.03M
  const int64_t kRadius = 2;
133
2.03M
  const size_t xend = rect.x1();
134
135
2.03M
  size_t rix = 0;
136
2.03M
  size_t ix = rect.x0();
137
2.03M
  const HWY_FULL(float) d;
138
2.03M
  const size_t N = Lanes(d);
139
2.03M
  const size_t aligned_x = RoundUpTo(kRadius, N);
140
18.3M
  for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
141
16.2M
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
142
16.2M
  }
143
90.8M
  for (; ix + N + kRadius <= xend; ix += N, rix += N) {
144
88.8M
    Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
145
88.8M
  }
146
17.6M
  for (; ix < xend; ++ix, ++rix) {
147
15.6M
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
148
15.6M
  }
149
2.03M
}
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Line
Count
Source
131
34.0k
                          float* JXL_RESTRICT row_out) {
132
34.0k
  const int64_t kRadius = 2;
133
34.0k
  const size_t xend = rect.x1();
134
135
34.0k
  size_t rix = 0;
136
34.0k
  size_t ix = rect.x0();
137
34.0k
  const HWY_FULL(float) d;
138
34.0k
  const size_t N = Lanes(d);
139
34.0k
  const size_t aligned_x = RoundUpTo(kRadius, N);
140
306k
  for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
141
272k
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
142
272k
  }
143
1.06M
  for (; ix + N + kRadius <= xend; ix += N, rix += N) {
144
1.02M
    Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
145
1.02M
  }
146
287k
  for (; ix < xend; ++ix, ++rix) {
147
252k
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
148
252k
  }
149
34.0k
}
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Line
Count
Source
131
2.00M
                          float* JXL_RESTRICT row_out) {
132
2.00M
  const int64_t kRadius = 2;
133
2.00M
  const size_t xend = rect.x1();
134
135
2.00M
  size_t rix = 0;
136
2.00M
  size_t ix = rect.x0();
137
2.00M
  const HWY_FULL(float) d;
138
2.00M
  const size_t N = Lanes(d);
139
2.00M
  const size_t aligned_x = RoundUpTo(kRadius, N);
140
18.0M
  for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
141
16.0M
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
142
16.0M
  }
143
89.7M
  for (; ix + N + kRadius <= xend; ix += N, rix += N) {
144
87.7M
    Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
145
87.7M
  }
146
17.3M
  for (; ix < xend; ++ix, ++rix) {
147
15.3M
    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
148
15.3M
  }
149
2.00M
}
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_ZEN4::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_ZEN4::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_SPR::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX3_SPR::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*)
150
151
// Semi-vectorized (interior pixels only); called directly like slow::, unlike
152
// the fully vectorized strategies below.
153
Status Symmetric5(const ImageF& in, const Rect& in_rect,
154
                  const WeightsSymmetric5& weights, ThreadPool* pool,
155
8.52k
                  ImageF* JXL_RESTRICT out, const Rect& out_rect) {
156
8.52k
  JXL_ENSURE(in_rect.xsize() == out_rect.xsize());
157
8.52k
  JXL_ENSURE(in_rect.ysize() == out_rect.ysize());
158
8.52k
  const size_t ysize = in_rect.ysize();
159
8.52k
  const auto process_row = [&](const uint32_t task,
160
2.03M
                               size_t /*thread*/) -> Status {
161
2.03M
    const int64_t riy = task;
162
2.03M
    const int64_t iy = in_rect.y0() + riy;
163
164
2.03M
    if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
165
34.0k
      Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
166
34.0k
                                out_rect.Row(out, riy));
167
2.00M
    } else {
168
2.00M
      Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
169
2.00M
                                   out_rect.Row(out, riy));
170
2.00M
    }
171
2.03M
    return true;
172
2.03M
  };
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_SSE4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
convolve_symmetric5.cc:jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
160
2.03M
                               size_t /*thread*/) -> Status {
161
2.03M
    const int64_t riy = task;
162
2.03M
    const int64_t iy = in_rect.y0() + riy;
163
164
2.03M
    if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
165
34.0k
      Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
166
34.0k
                                out_rect.Row(out, riy));
167
2.00M
    } else {
168
2.00M
      Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
169
2.00M
                                   out_rect.Row(out, riy));
170
2.00M
    }
171
2.03M
    return true;
172
2.03M
  };
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_AVX3::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_AVX3_ZEN4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_AVX3_SPR::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_SSE2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const
173
8.52k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
174
8.52k
                                ThreadPool::NoInit, process_row,
175
8.52k
                                "Symmetric5x5Convolution"));
176
8.52k
  return true;
177
8.52k
}
Unexecuted instantiation: jxl::N_SSE4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Line
Count
Source
155
8.52k
                  ImageF* JXL_RESTRICT out, const Rect& out_rect) {
156
8.52k
  JXL_ENSURE(in_rect.xsize() == out_rect.xsize());
157
8.52k
  JXL_ENSURE(in_rect.ysize() == out_rect.ysize());
158
8.52k
  const size_t ysize = in_rect.ysize();
159
8.52k
  const auto process_row = [&](const uint32_t task,
160
8.52k
                               size_t /*thread*/) -> Status {
161
8.52k
    const int64_t riy = task;
162
8.52k
    const int64_t iy = in_rect.y0() + riy;
163
164
8.52k
    if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
165
8.52k
      Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
166
8.52k
                                out_rect.Row(out, riy));
167
8.52k
    } else {
168
8.52k
      Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
169
8.52k
                                   out_rect.Row(out, riy));
170
8.52k
    }
171
8.52k
    return true;
172
8.52k
  };
173
8.52k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
174
8.52k
                                ThreadPool::NoInit, process_row,
175
8.52k
                                "Symmetric5x5Convolution"));
176
8.52k
  return true;
177
8.52k
}
Unexecuted instantiation: jxl::N_AVX3::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: jxl::N_AVX3_SPR::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
Unexecuted instantiation: jxl::N_SSE2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)
178
179
// NOLINTNEXTLINE(google-readability-namespace-comments)
180
}  // namespace HWY_NAMESPACE
181
}  // namespace jxl
182
HWY_AFTER_NAMESPACE();
183
184
#if HWY_ONCE
185
namespace jxl {
186
187
HWY_EXPORT(Symmetric5);
188
Status Symmetric5(const ImageF& in, const Rect& in_rect,
189
                  const WeightsSymmetric5& weights, ThreadPool* pool,
190
8.52k
                  ImageF* JXL_RESTRICT out, const Rect& out_rect) {
191
8.52k
  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out,
192
8.52k
                                          out_rect);
193
8.52k
}
194
195
}  // namespace jxl
196
#endif  // HWY_ONCE