/src/libjxl/lib/jxl/convolve_symmetric5.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include <algorithm> |
7 | | #include <cstddef> |
8 | | #include <cstdint> |
9 | | #include <cstdio> |
10 | | |
11 | | #include "lib/jxl/base/compiler_specific.h" |
12 | | #include "lib/jxl/base/data_parallel.h" |
13 | | #include "lib/jxl/base/status.h" |
14 | | #include "lib/jxl/convolve.h" |
15 | | #include "lib/jxl/image.h" |
16 | | |
17 | | #undef HWY_TARGET_INCLUDE |
18 | | #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc" |
19 | | #include <hwy/foreach_target.h> |
20 | | #include <hwy/highway.h> |
21 | | |
22 | | #include "lib/jxl/base/common.h" |
23 | | #include "lib/jxl/base/rect.h" |
24 | | #include "lib/jxl/image_ops.h" |
25 | | |
26 | | HWY_BEFORE_NAMESPACE(); |
27 | | namespace jxl { |
28 | | namespace HWY_NAMESPACE { |
29 | | |
30 | | // These templates are not found via ADL. |
31 | | using hwy::HWY_NAMESPACE::Add; |
32 | | using hwy::HWY_NAMESPACE::Mul; |
33 | | using hwy::HWY_NAMESPACE::Vec; |
34 | | |
35 | | // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. |
36 | | template <class WrapY> |
37 | | static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, |
38 | | const int64_t ix, const int64_t iy, |
39 | | const size_t xsize, const size_t ysize, |
40 | | const float wx0, const float wx1, |
41 | 18.7M | const float wx2) { |
42 | 18.7M | const WrapMirror wrap_x; |
43 | 18.7M | const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); |
44 | 18.7M | const float in_m2 = row[wrap_x(ix - 2, xsize)]; |
45 | 18.7M | const float in_p2 = row[wrap_x(ix + 2, xsize)]; |
46 | 18.7M | const float in_m1 = row[wrap_x(ix - 1, xsize)]; |
47 | 18.7M | const float in_p1 = row[wrap_x(ix + 1, xsize)]; |
48 | 18.7M | const float in_00 = row[ix]; |
49 | 18.7M | const float sum_2 = wx2 * (in_m2 + in_p2); |
50 | 18.7M | const float sum_1 = wx1 * (in_m1 + in_p1); |
51 | 18.7M | const float sum_0 = wx0 * in_00; |
52 | 18.7M | return sum_2 + (sum_1 + sum_0); |
53 | 18.7M | } Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE4::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Line | Count | Source | 41 | 224k | const float wx2) { | 42 | 224k | const WrapMirror wrap_x; | 43 | 224k | const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); | 44 | 224k | const float in_m2 = row[wrap_x(ix - 2, xsize)]; | 45 | 224k | const float in_p2 = row[wrap_x(ix + 2, xsize)]; | 46 | 224k | const float in_m1 = row[wrap_x(ix - 1, xsize)]; | 47 | 224k | const float in_p1 = row[wrap_x(ix + 1, xsize)]; | 48 | 224k | const float in_00 = row[ix]; | 49 | 224k | const float sum_2 = wx2 * (in_m2 + in_p2); | 50 | 224k | const float sum_1 = wx1 * (in_m1 + in_p1); | 51 | 224k | const float sum_0 = wx0 * in_00; | 52 | 224k | return sum_2 + (sum_1 + sum_0); | 53 | 224k | } |
convolve_symmetric5.cc:float jxl::N_AVX2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) Line | Count | Source | 41 | 18.4M | const float wx2) { | 42 | 18.4M | const WrapMirror wrap_x; | 43 | 18.4M | const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); | 44 | 18.4M | const float in_m2 = row[wrap_x(ix - 2, xsize)]; | 45 | 18.4M | const float in_p2 = row[wrap_x(ix + 2, xsize)]; | 46 | 18.4M | const float in_m1 = row[wrap_x(ix - 1, xsize)]; | 47 | 18.4M | const float in_p1 = row[wrap_x(ix + 1, xsize)]; | 48 | 18.4M | const float in_00 = row[ix]; | 49 | 18.4M | const float sum_2 = wx2 * (in_m2 + in_p2); | 50 | 18.4M | const float sum_1 = wx1 * (in_m1 + in_p1); | 51 | 18.4M | const float sum_0 = wx0 * in_00; | 52 | 18.4M | return sum_2 + (sum_1 + sum_0); | 53 | 18.4M | } |
Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::WrapMirror, long, long, unsigned long, unsigned long, float, float, float) Unexecuted instantiation: convolve_symmetric5.cc:float jxl::N_SSE2::WeightedSumBorder<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::WrapUnchanged, long, long, unsigned long, unsigned long, float, float, float) |
54 | | |
55 | | template <class WrapY, class V> |
56 | | static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, |
57 | | const int64_t iy, const size_t ysize, const V wx0, |
58 | 61.3M | const V wx1, const V wx2) { |
59 | 61.3M | const HWY_FULL(float) d; |
60 | 61.3M | const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; |
61 | 61.3M | const auto in_m2 = LoadU(d, center - 2); |
62 | 61.3M | const auto in_p2 = LoadU(d, center + 2); |
63 | 61.3M | const auto in_m1 = LoadU(d, center - 1); |
64 | 61.3M | const auto in_p1 = LoadU(d, center + 1); |
65 | 61.3M | const auto in_00 = LoadU(d, center); |
66 | 61.3M | const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); |
67 | 61.3M | const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); |
68 | 61.3M | const auto sum_0 = Mul(wx0, in_00); |
69 | 61.3M | return Add(sum_2, Add(sum_1, sum_0)); |
70 | 61.3M | } Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapMirror, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE4::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapMirror, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 58 | 617k | const V wx1, const V wx2) { | 59 | 617k | const HWY_FULL(float) d; | 60 | 617k | const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; | 61 | 617k | const auto in_m2 = LoadU(d, center - 2); | 62 | 617k | const auto in_p2 = LoadU(d, center + 2); | 63 | 617k | const auto in_m1 = LoadU(d, center - 1); | 64 | 617k | const auto in_p1 = LoadU(d, center + 1); | 65 | 617k | const auto in_00 = LoadU(d, center); | 66 | 617k | const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); | 67 | 617k | const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); | 68 | 617k | const auto sum_0 = Mul(wx0, in_00); | 69 | 617k | return Add(sum_2, Add(sum_1, sum_0)); | 70 | 617k | } |
convolve_symmetric5.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::WeightedSum<jxl::WrapUnchanged, hwy::N_AVX2::Vec256<float> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 58 | 60.7M | const V wx1, const V wx2) { | 59 | 60.7M | const HWY_FULL(float) d; | 60 | 60.7M | const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; | 61 | 60.7M | const auto in_m2 = LoadU(d, center - 2); | 62 | 60.7M | const auto in_p2 = LoadU(d, center + 2); | 63 | 60.7M | const auto in_m1 = LoadU(d, center - 1); | 64 | 60.7M | const auto in_p1 = LoadU(d, center + 1); | 65 | 60.7M | const auto in_00 = LoadU(d, center); | 66 | 60.7M | const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); | 67 | 60.7M | const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); | 68 | 60.7M | const auto sum_0 = Mul(wx0, in_00); | 69 | 60.7M | return Add(sum_2, Add(sum_1, sum_0)); | 70 | 60.7M | } |
Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapMirror, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapMirror, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) Unexecuted instantiation: convolve_symmetric5.cc:hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::WeightedSum<jxl::WrapUnchanged, hwy::N_SSE2::Vec128<float, 4ul> >(jxl::Plane<float> const&, jxl::WrapUnchanged, unsigned long, long, unsigned long, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) |
71 | | |
72 | | // Produces result for one pixel |
73 | | template <class WrapY> |
74 | | float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy, |
75 | 3.74M | const WeightsSymmetric5& weights) { |
76 | 3.74M | const float w0 = weights.c[0]; |
77 | 3.74M | const float w1 = weights.r[0]; |
78 | 3.74M | const float w2 = weights.R[0]; |
79 | 3.74M | const float w4 = weights.d[0]; |
80 | 3.74M | const float w5 = weights.L[0]; |
81 | 3.74M | const float w8 = weights.D[0]; |
82 | | |
83 | 3.74M | const size_t xsize = in.xsize(); |
84 | 3.74M | const size_t ysize = in.ysize(); |
85 | 3.74M | const WrapY wrap_y; |
86 | | // Unrolled loop over all 5 rows of the kernel. |
87 | 3.74M | float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); |
88 | | |
89 | 3.74M | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); |
90 | 3.74M | float sum1 = |
91 | 3.74M | WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); |
92 | | |
93 | 3.74M | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); |
94 | 3.74M | sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); |
95 | | |
96 | 3.74M | return sum0 + sum1; |
97 | 3.74M | } Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_SSE4::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) float jxl::N_AVX2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Line | Count | Source | 75 | 44.9k | const WeightsSymmetric5& weights) { | 76 | 44.9k | const float w0 = weights.c[0]; | 77 | 44.9k | const float w1 = weights.r[0]; | 78 | 44.9k | const float w2 = weights.R[0]; | 79 | 44.9k | const float w4 = weights.d[0]; | 80 | 44.9k | const float w5 = weights.L[0]; | 81 | 44.9k | const float w8 = weights.D[0]; | 82 | | | 83 | 44.9k | const size_t xsize = in.xsize(); | 84 | 44.9k | const size_t ysize = in.ysize(); | 85 | 44.9k | const WrapY wrap_y; | 86 | | // Unrolled loop over all 5 rows of the kernel. | 87 | 44.9k | float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); | 88 | | | 89 | 44.9k | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); | 90 | 44.9k | float sum1 = | 91 | 44.9k | WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); | 92 | | | 93 | 44.9k | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); | 94 | 44.9k | sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); | 95 | | | 96 | 44.9k | return sum0 + sum1; | 97 | 44.9k | } |
float jxl::N_AVX2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Line | Count | Source | 75 | 3.69M | const WeightsSymmetric5& weights) { | 76 | 3.69M | const float w0 = weights.c[0]; | 77 | 3.69M | const float w1 = weights.r[0]; | 78 | 3.69M | const float w2 = weights.R[0]; | 79 | 3.69M | const float w4 = weights.d[0]; | 80 | 3.69M | const float w5 = weights.L[0]; | 81 | 3.69M | const float w8 = weights.D[0]; | 82 | | | 83 | 3.69M | const size_t xsize = in.xsize(); | 84 | 3.69M | const size_t ysize = in.ysize(); | 85 | 3.69M | const WrapY wrap_y; | 86 | | // Unrolled loop over all 5 rows of the kernel. | 87 | 3.69M | float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); | 88 | | | 89 | 3.69M | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); | 90 | 3.69M | float sum1 = | 91 | 3.69M | WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); | 92 | | | 93 | 3.69M | sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); | 94 | 3.69M | sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); | 95 | | | 96 | 3.69M | return sum0 + sum1; | 97 | 3.69M | } |
Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) Unexecuted instantiation: float jxl::N_SSE2::Symmetric5Border<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, jxl::WeightsSymmetric5 const&) |
98 | | |
99 | | // Produces result for one vector's worth of pixels |
100 | | template <class WrapY> |
101 | | static void Symmetric5Interior(const ImageF& in, const int64_t ix, |
102 | | const int64_t rix, const int64_t iy, |
103 | | const WeightsSymmetric5& weights, |
104 | 12.2M | float* JXL_RESTRICT row_out) { |
105 | 12.2M | const HWY_FULL(float) d; |
106 | | |
107 | 12.2M | const auto w0 = LoadDup128(d, weights.c); |
108 | 12.2M | const auto w1 = LoadDup128(d, weights.r); |
109 | 12.2M | const auto w2 = LoadDup128(d, weights.R); |
110 | 12.2M | const auto w4 = LoadDup128(d, weights.d); |
111 | 12.2M | const auto w5 = LoadDup128(d, weights.L); |
112 | 12.2M | const auto w8 = LoadDup128(d, weights.D); |
113 | | |
114 | 12.2M | const size_t ysize = in.ysize(); |
115 | 12.2M | const WrapY wrap_y; |
116 | | // Unrolled loop over all 5 rows of the kernel. |
117 | 12.2M | auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); |
118 | | |
119 | 12.2M | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); |
120 | 12.2M | auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); |
121 | | |
122 | 12.2M | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); |
123 | 12.2M | sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); |
124 | | |
125 | 12.2M | StoreU(Add(sum0, sum1), d, row_out + rix); |
126 | 12.2M | } Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Line | Count | Source | 104 | 123k | float* JXL_RESTRICT row_out) { | 105 | 123k | const HWY_FULL(float) d; | 106 | | | 107 | 123k | const auto w0 = LoadDup128(d, weights.c); | 108 | 123k | const auto w1 = LoadDup128(d, weights.r); | 109 | 123k | const auto w2 = LoadDup128(d, weights.R); | 110 | 123k | const auto w4 = LoadDup128(d, weights.d); | 111 | 123k | const auto w5 = LoadDup128(d, weights.L); | 112 | 123k | const auto w8 = LoadDup128(d, weights.D); | 113 | | | 114 | 123k | const size_t ysize = in.ysize(); | 115 | 123k | const WrapY wrap_y; | 116 | | // Unrolled loop over all 5 rows of the kernel. | 117 | 123k | auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); | 118 | | | 119 | 123k | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); | 120 | 123k | auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); | 121 | | | 122 | 123k | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); | 123 | 123k | sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); | 124 | | | 125 | 123k | StoreU(Add(sum0, sum1), d, row_out + rix); | 126 | 123k | } |
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Line | Count | Source | 104 | 12.1M | float* JXL_RESTRICT row_out) { | 105 | 12.1M | const HWY_FULL(float) d; | 106 | | | 107 | 12.1M | const auto w0 = LoadDup128(d, weights.c); | 108 | 12.1M | const auto w1 = LoadDup128(d, weights.r); | 109 | 12.1M | const auto w2 = LoadDup128(d, weights.R); | 110 | 12.1M | const auto w4 = LoadDup128(d, weights.d); | 111 | 12.1M | const auto w5 = LoadDup128(d, weights.L); | 112 | 12.1M | const auto w8 = LoadDup128(d, weights.D); | 113 | | | 114 | 12.1M | const size_t ysize = in.ysize(); | 115 | 12.1M | const WrapY wrap_y; | 116 | | // Unrolled loop over all 5 rows of the kernel. | 117 | 12.1M | auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); | 118 | | | 119 | 12.1M | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); | 120 | 12.1M | auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); | 121 | | | 122 | 12.1M | sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); | 123 | 12.1M | sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); | 124 | | | 125 | 12.1M | StoreU(Add(sum0, sum1), d, row_out + rix); | 126 | 12.1M | } |
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapMirror>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Interior<jxl::WrapUnchanged>(jxl::Plane<float> const&, long, long, long, jxl::WeightsSymmetric5 const&, float*) |
127 | | |
128 | | template <class WrapY> |
129 | | static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, |
130 | | const WeightsSymmetric5& weights, |
131 | 234k | float* JXL_RESTRICT row_out) { |
132 | 234k | const int64_t kRadius = 2; |
133 | 234k | const size_t xend = rect.x1(); |
134 | | |
135 | 234k | size_t rix = 0; |
136 | 234k | size_t ix = rect.x0(); |
137 | 234k | const HWY_FULL(float) d; |
138 | 234k | const size_t N = Lanes(d); |
139 | 234k | const size_t aligned_x = RoundUpTo(kRadius, N); |
140 | 2.10M | for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { |
141 | 1.87M | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); |
142 | 1.87M | } |
143 | 12.5M | for (; ix + N + kRadius <= xend; ix += N, rix += N) { |
144 | 12.2M | Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); |
145 | 12.2M | } |
146 | 2.10M | for (; ix < xend; ++ix, ++rix) { |
147 | 1.86M | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); |
148 | 1.86M | } |
149 | 234k | } Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE4::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Line | Count | Source | 131 | 2.97k | float* JXL_RESTRICT row_out) { | 132 | 2.97k | const int64_t kRadius = 2; | 133 | 2.97k | const size_t xend = rect.x1(); | 134 | | | 135 | 2.97k | size_t rix = 0; | 136 | 2.97k | size_t ix = rect.x0(); | 137 | 2.97k | const HWY_FULL(float) d; | 138 | 2.97k | const size_t N = Lanes(d); | 139 | 2.97k | const size_t aligned_x = RoundUpTo(kRadius, N); | 140 | 26.7k | for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { | 141 | 23.8k | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); | 142 | 23.8k | } | 143 | 126k | for (; ix + N + kRadius <= xend; ix += N, rix += N) { | 144 | 123k | Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); | 145 | 123k | } | 146 | 24.0k | for (; ix < xend; ++ix, ++rix) { | 147 | 21.1k | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); | 148 | 21.1k | } | 149 | 2.97k | } |
convolve_symmetric5.cc:void jxl::N_AVX2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Line | Count | Source | 131 | 231k | float* JXL_RESTRICT row_out) { | 132 | 231k | const int64_t kRadius = 2; | 133 | 231k | const size_t xend = rect.x1(); | 134 | | | 135 | 231k | size_t rix = 0; | 136 | 231k | size_t ix = rect.x0(); | 137 | 231k | const HWY_FULL(float) d; | 138 | 231k | const size_t N = Lanes(d); | 139 | 231k | const size_t aligned_x = RoundUpTo(kRadius, N); | 140 | 2.08M | for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { | 141 | 1.85M | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); | 142 | 1.85M | } | 143 | 12.3M | for (; ix + N + kRadius <= xend; ix += N, rix += N) { | 144 | 12.1M | Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); | 145 | 12.1M | } | 146 | 2.07M | for (; ix < xend; ++ix, ++rix) { | 147 | 1.84M | row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); | 148 | 1.84M | } | 149 | 231k | } |
Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapMirror>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) Unexecuted instantiation: convolve_symmetric5.cc:void jxl::N_SSE2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, long, jxl::WeightsSymmetric5 const&, float*) |
150 | | |
151 | | // Semi-vectorized (interior pixels only); called directly like slow::, unlike |
152 | | // the fully vectorized strategies below. |
153 | | Status Symmetric5(const ImageF& in, const Rect& in_rect, |
154 | | const WeightsSymmetric5& weights, ThreadPool* pool, |
155 | 744 | ImageF* JXL_RESTRICT out, const Rect& out_rect) { |
156 | 744 | JXL_ENSURE(in_rect.xsize() == out_rect.xsize()); |
157 | 744 | JXL_ENSURE(in_rect.ysize() == out_rect.ysize()); |
158 | 744 | const size_t ysize = in_rect.ysize(); |
159 | 744 | const auto process_row = [&](const uint32_t task, |
160 | 234k | size_t /*thread*/) -> Status { |
161 | 234k | const int64_t riy = task; |
162 | 234k | const int64_t iy = in_rect.y0() + riy; |
163 | | |
164 | 234k | if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { |
165 | 2.97k | Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, |
166 | 2.97k | out_rect.Row(out, riy)); |
167 | 231k | } else { |
168 | 231k | Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, |
169 | 231k | out_rect.Row(out, riy)); |
170 | 231k | } |
171 | 234k | return true; |
172 | 234k | }; Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_SSE4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const convolve_symmetric5.cc:jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 160 | 234k | size_t /*thread*/) -> Status { | 161 | 234k | const int64_t riy = task; | 162 | 234k | const int64_t iy = in_rect.y0() + riy; | 163 | | | 164 | 234k | if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { | 165 | 2.97k | Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, | 166 | 2.97k | out_rect.Row(out, riy)); | 167 | 231k | } else { | 168 | 231k | Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, | 169 | 231k | out_rect.Row(out, riy)); | 170 | 231k | } | 171 | 234k | return true; | 172 | 234k | }; |
Unexecuted instantiation: convolve_symmetric5.cc:jxl::N_SSE2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&)::$_0::operator()(unsigned int, unsigned long) const |
173 | 744 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize), |
174 | 744 | ThreadPool::NoInit, process_row, |
175 | 744 | "Symmetric5x5Convolution")); |
176 | 744 | return true; |
177 | 744 | } Unexecuted instantiation: jxl::N_SSE4::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) Line | Count | Source | 155 | 744 | ImageF* JXL_RESTRICT out, const Rect& out_rect) { | 156 | 744 | JXL_ENSURE(in_rect.xsize() == out_rect.xsize()); | 157 | 744 | JXL_ENSURE(in_rect.ysize() == out_rect.ysize()); | 158 | 744 | const size_t ysize = in_rect.ysize(); | 159 | 744 | const auto process_row = [&](const uint32_t task, | 160 | 744 | size_t /*thread*/) -> Status { | 161 | 744 | const int64_t riy = task; | 162 | 744 | const int64_t iy = in_rect.y0() + riy; | 163 | | | 164 | 744 | if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { | 165 | 744 | Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, | 166 | 744 | out_rect.Row(out, riy)); | 167 | 744 | } else { | 168 | 744 | Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, | 169 | 744 | out_rect.Row(out, riy)); | 170 | 744 | } | 171 | 744 | return true; | 172 | 744 | }; | 173 | 744 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize), | 174 | 744 | ThreadPool::NoInit, process_row, | 175 | 744 | "Symmetric5x5Convolution")); | 176 | 744 | return true; | 177 | 744 | } |
Unexecuted instantiation: jxl::N_SSE2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long> const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Plane<float>*, jxl::RectT<unsigned long> const&) |
178 | | |
179 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
180 | | } // namespace HWY_NAMESPACE |
181 | | } // namespace jxl |
182 | | HWY_AFTER_NAMESPACE(); |
183 | | |
184 | | #if HWY_ONCE |
185 | | namespace jxl { |
186 | | |
187 | | HWY_EXPORT(Symmetric5); |
188 | | Status Symmetric5(const ImageF& in, const Rect& in_rect, |
189 | | const WeightsSymmetric5& weights, ThreadPool* pool, |
190 | 744 | ImageF* JXL_RESTRICT out, const Rect& out_rect) { |
191 | 744 | return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out, |
192 | 744 | out_rect); |
193 | 744 | } |
194 | | |
195 | | } // namespace jxl |
196 | | #endif // HWY_ONCE |