/src/libjxl/lib/jxl/convolve_symmetric5.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/convolve.h" |
7 | | |
8 | | #undef HWY_TARGET_INCLUDE |
9 | | #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc" |
10 | | #include <hwy/foreach_target.h> |
11 | | #include <hwy/highway.h> |
12 | | |
13 | | #include "lib/jxl/base/common.h" |
14 | | #include "lib/jxl/base/rect.h" |
15 | | #include "lib/jxl/convolve-inl.h" |
16 | | |
17 | | HWY_BEFORE_NAMESPACE(); |
18 | | namespace jxl { |
19 | | namespace HWY_NAMESPACE { |
20 | | |
21 | | // These templates are not found via ADL. |
22 | | using hwy::HWY_NAMESPACE::Add; |
23 | | using hwy::HWY_NAMESPACE::Mul; |
24 | | using hwy::HWY_NAMESPACE::Vec; |
25 | | |
26 | | // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. |
27 | | template <class WrapY> |
28 | | static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, |
29 | | const int64_t ix, const int64_t iy, |
30 | | const size_t xsize, const size_t ysize, |
31 | | const float wx0, const float wx1, |
32 | 0 | const float wx2) { |
33 | 0 | const WrapMirror wrap_x; |
34 | 0 | const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); |
35 | 0 | const float in_m2 = row[wrap_x(ix - 2, xsize)]; |
36 | 0 | const float in_p2 = row[wrap_x(ix + 2, xsize)]; |
37 | 0 | const float in_m1 = row[wrap_x(ix - 1, xsize)]; |
38 | 0 | const float in_p1 = row[wrap_x(ix + 1, xsize)]; |
39 | 0 | const float in_00 = row[ix]; |
40 | 0 | const float sum_2 = wx2 * (in_m2 + in_p2); |
41 | 0 | const float sum_1 = wx1 * (in_m1 + in_p1); |
42 | 0 | const float sum_0 = wx0 * in_00; |
43 | 0 | return sum_2 + (sum_1 + sum_0); |
44 | 0 | } Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) |
45 | | |
46 | | template <class WrapY, class V> |
47 | | static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, |
48 | | const int64_t iy, const size_t ysize, const V wx0, |
49 | 0 | const V wx1, const V wx2) { |
50 | 0 | const HWY_FULL(float) d; |
51 | 0 | const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; |
52 | 0 | const auto in_m2 = LoadU(d, center - 2); |
53 | 0 | const auto in_p2 = LoadU(d, center + 2); |
54 | 0 | const auto in_m1 = LoadU(d, center - 1); |
55 | 0 | const auto in_p1 = LoadU(d, center + 1); |
56 | 0 | const auto in_00 = LoadU(d, center); |
57 | 0 | const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); |
58 | 0 | const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); |
59 | 0 | const auto sum_0 = Mul(wx0, in_00); |
60 | 0 | return Add(sum_2, Add(sum_1, sum_0)); |
61 | 0 | } Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapMirror, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapMirror, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapMirror, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) |
62 | | |
63 | | // Produces the result for one pixel. |
64 | | template <class WrapY> |
65 | | float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy, |
66 | 0 | const WeightsSymmetric5& weights) { |
67 | 0 | const float w0 = weights.c[0]; |
68 | 0 | const float w1 = weights.r[0]; |
69 | 0 | const float w2 = weights.R[0]; |
70 | 0 | const float w4 = weights.d[0]; |
71 | 0 | const float w5 = weights.L[0]; |
72 | 0 | const float w8 = weights.D[0]; |
73 | |
|
74 | 0 | const size_t xsize = in.xsize(); |
75 | 0 | const size_t ysize = in.ysize(); |
76 | 0 | const WrapY wrap_y; |
77 | | // Unrolled loop over all 5 rows of the kernel. |
78 | 0 | float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); |
79 | |
|
80 | 0 | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); |
81 | 0 | float sum1 = |
82 | 0 | WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); |
83 | |
|
84 | 0 | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); |
85 | 0 | sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); |
86 | |
|
87 | 0 | return sum0 + sum1; |
88 | 0 | } Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_AVX2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_AVX2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) |
89 | | |
90 | | // Produces the result for one vector's worth of pixels. |
91 | | template <class WrapY> |
92 | | static void Symmetric5Interior(const ImageF& in, const int64_t ix, |
93 | | const int64_t rix, const int64_t iy, |
94 | | const WeightsSymmetric5& weights, |
95 | 0 | float* JXL_RESTRICT row_out) { |
96 | 0 | const HWY_FULL(float) d; |
97 | |
|
98 | 0 | const auto w0 = LoadDup128(d, weights.c); |
99 | 0 | const auto w1 = LoadDup128(d, weights.r); |
100 | 0 | const auto w2 = LoadDup128(d, weights.R); |
101 | 0 | const auto w4 = LoadDup128(d, weights.d); |
102 | 0 | const auto w5 = LoadDup128(d, weights.L); |
103 | 0 | const auto w8 = LoadDup128(d, weights.D); |
104 | |
|
105 | 0 | const size_t ysize = in.ysize(); |
106 | 0 | const WrapY wrap_y; |
107 | | // Unrolled loop over all 5 rows of the kernel. |
108 | 0 | auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); |
109 | |
|
110 | 0 | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); |
111 | 0 | auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); |
112 | |
|
113 | 0 | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); |
114 | 0 | sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); |
115 | |
|
116 | 0 | StoreU(Add(sum0, sum1), d, row_out + rix); |
117 | 0 | } Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) |
118 | | |
119 | | template <class WrapY> |
120 | | static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, |
121 | | const WeightsSymmetric5& weights, |
122 | 0 | float* JXL_RESTRICT row_out) { |
123 | 0 | const int64_t kRadius = 2; |
124 | 0 | const size_t xend = rect.x1(); |
125 | |
|
126 | 0 | size_t rix = 0; |
127 | 0 | size_t ix = rect.x0(); |
128 | 0 | const HWY_FULL(float) d; |
129 | 0 | const size_t N = Lanes(d); |
130 | 0 | const size_t aligned_x = RoundUpTo(kRadius, N); |
131 | 0 | for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { |
132 | 0 | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); |
133 | 0 | } |
134 | 0 | for (; ix + N + kRadius <= xend; ix += N, rix += N) { |
135 | 0 | Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); |
136 | 0 | } |
137 | 0 | for (; ix < xend; ++ix, ++rix) { |
138 | 0 | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); |
139 | 0 | } |
140 | 0 | } Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) |
141 | | |
142 | | // Semi-vectorized (interior pixels only); called directly like slow::, unlike |
143 | | // the fully vectorized strategies below. |
144 | | void Symmetric5(const ImageF& in, const Rect& in_rect, |
145 | | const WeightsSymmetric5& weights, ThreadPool* pool, |
146 | 0 | ImageF* JXL_RESTRICT out, const Rect& out_rect) { |
147 | 0 | JXL_ASSERT(in_rect.xsize() == out_rect.xsize()); |
148 | 0 | JXL_ASSERT(in_rect.ysize() == out_rect.ysize()); |
149 | 0 | const size_t ysize = in_rect.ysize(); |
150 | 0 | JXL_CHECK(RunOnPool( |
151 | 0 | pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit, |
152 | 0 | [&](const uint32_t task, size_t /*thread*/) { |
153 | 0 | const int64_t riy = task; |
154 | 0 | const int64_t iy = in_rect.y0() + riy; |
155 | |
|
156 | 0 | if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { |
157 | 0 | Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, |
158 | 0 | out_rect.Row(out, riy)); |
159 | 0 | } else { |
160 | 0 | Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, |
161 | 0 | out_rect.Row(out, riy)); |
162 | 0 | } |
163 | 0 | }, |
164 | 0 | "Symmetric5x5Convolution")); |
165 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) Unexecuted instantiation: jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) Unexecuted instantiation: jxl::N_SSE2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) |
166 | | |
167 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
168 | | } // namespace HWY_NAMESPACE |
169 | | } // namespace jxl |
170 | | HWY_AFTER_NAMESPACE(); |
171 | | |
172 | | #if HWY_ONCE |
173 | | namespace jxl { |
174 | | |
175 | | HWY_EXPORT(Symmetric5); |
176 | | void Symmetric5(const ImageF& in, const Rect& in_rect, |
177 | | const WeightsSymmetric5& weights, ThreadPool* pool, |
178 | 0 | ImageF* JXL_RESTRICT out, const Rect& out_rect) { |
179 | 0 | HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out, out_rect); |
180 | 0 | } |
181 | | |
182 | | void Symmetric5(const ImageF& in, const Rect& rect, |
183 | | const WeightsSymmetric5& weights, ThreadPool* pool, |
184 | 0 | ImageF* JXL_RESTRICT out) { |
185 | 0 | Symmetric5(in, rect, weights, pool, out, Rect(*out)); |
186 | 0 | } |
187 | | |
188 | | } // namespace jxl |
189 | | #endif // HWY_ONCE |