/src/libjxl/lib/jxl/simd_util-inl.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // Misc utilities for SIMD operations |
7 | | |
8 | | #include <cstddef> |
9 | | #include <cstdint> |
10 | | |
11 | | #if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE) |
12 | | #ifdef LIB_JXL_SIMD_UTIL_INL_H_ |
13 | | #undef LIB_JXL_SIMD_UTIL_INL_H_ |
14 | | #else |
15 | | #define LIB_JXL_SIMD_UTIL_INL_H_ |
16 | | #endif |
17 | | |
18 | | #include <hwy/highway.h> |
19 | | |
20 | | #include "lib/jxl/base/compiler_specific.h" |
21 | | |
22 | | HWY_BEFORE_NAMESPACE(); |
23 | | namespace jxl { |
24 | | namespace HWY_NAMESPACE { |
25 | | |
26 | | #if HWY_CAP_GE512 |
27 | | using hwy::HWY_NAMESPACE::Half; |
28 | | using hwy::HWY_NAMESPACE::Vec; |
29 | | template <size_t i, class DF, class V> |
30 | | HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) { |
31 | | using HF = Half<DF>; |
32 | | using HHF = Half<HF>; |
33 | | auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v); |
34 | | return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half); |
35 | | } |
36 | | |
37 | | template <class DF, class V> |
38 | | HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) { |
39 | | using HF = Half<DF>; |
40 | | return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0)); |
41 | | } |
42 | | |
43 | | #endif |
44 | | |
45 | | // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be |
46 | | // aligned. |
47 | | template <class DF, class V, typename T> |
48 | 14.2M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { |
49 | 14.2M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); |
50 | | #if HWY_TARGET == HWY_SCALAR |
51 | | Store(v0, df, mem); |
52 | | Store(v1, df, mem + 1); |
53 | | #elif !HWY_CAP_GE256 |
54 | | Store(InterleaveLower(df, v0, v1), df, mem); |
55 | | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); |
56 | | #else |
57 | 14.2M | if (!HWY_CAP_GE512 || Lanes(df) == 8) { |
58 | 14.2M | auto t0 = InterleaveLower(df, v0, v1); |
59 | 14.2M | auto t1 = InterleaveUpper(df, v0, v1); |
60 | 14.2M | Store(ConcatLowerLower(df, t1, t0), df, mem); |
61 | 14.2M | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); |
62 | 14.2M | } else { |
63 | | #if HWY_CAP_GE512 |
64 | | auto t0 = InterleaveLower(df, v0, v1); |
65 | | auto t1 = InterleaveUpper(df, v0, v1); |
66 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), |
67 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), |
68 | | df, mem); |
69 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), |
70 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), |
71 | | df, mem + Lanes(df)); |
72 | | #endif |
73 | 0 | } |
74 | | #endif |
75 | 14.2M | } Unexecuted instantiation: void jxl::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<int, 4ul, 0>, hwy::N_SSE4::Vec128<int, 4ul>, int>(hwy::N_SSE4::Simd<int, 4ul, 0>, hwy::N_SSE4::Vec128<int, 4ul>, hwy::N_SSE4::Vec128<int, 4ul>, int*) void jxl::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<int, 8ul, 0>, hwy::N_AVX2::Vec256<int>, int>(hwy::N_AVX2::Simd<int, 8ul, 0>, hwy::N_AVX2::Vec256<int>, hwy::N_AVX2::Vec256<int>, int*) Line | Count | Source | 48 | 5.32M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { | 49 | 5.32M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 50 | | #if HWY_TARGET == HWY_SCALAR | 51 | | Store(v0, df, mem); | 52 | | Store(v1, df, mem + 1); | 53 | | #elif !HWY_CAP_GE256 | 54 | | Store(InterleaveLower(df, v0, v1), df, mem); | 55 | | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); | 56 | | #else | 57 | 5.32M | if (!HWY_CAP_GE512 || Lanes(df) == 8) { | 58 | 5.32M | auto t0 = InterleaveLower(df, v0, v1); | 59 | 5.32M | auto t1 = InterleaveUpper(df, v0, v1); | 60 | 5.32M | Store(ConcatLowerLower(df, t1, t0), df, mem); | 61 | 5.32M | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); | 62 | 5.32M | } else { | 63 | | #if HWY_CAP_GE512 | 64 | | auto t0 = InterleaveLower(df, v0, v1); | 65 | | auto t1 = InterleaveUpper(df, v0, v1); | 66 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), | 67 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), | 68 | | df, mem); | 69 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), | 70 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), | 71 | | df, mem + Lanes(df)); | 72 | | #endif | 73 | 0 | } | 74 | 5.32M | #endif | 75 | 5.32M | } |
Unexecuted instantiation: void jxl::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<int, 4ul, 0>, hwy::N_SSE2::Vec128<int, 4ul>, int>(hwy::N_SSE2::Simd<int, 4ul, 0>, hwy::N_SSE2::Vec128<int, 4ul>, hwy::N_SSE2::Vec128<int, 4ul>, int*) Unexecuted instantiation: void jxl::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, float>(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*) void jxl::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, float>(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*) Line | Count | Source | 48 | 8.91M | void StoreInterleaved(const DF df, V v0, V v1, T* mem) { | 49 | 8.91M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 50 | | #if HWY_TARGET == HWY_SCALAR | 51 | | Store(v0, df, mem); | 52 | | Store(v1, df, mem + 1); | 53 | | #elif !HWY_CAP_GE256 | 54 | | Store(InterleaveLower(df, v0, v1), df, mem); | 55 | | Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df)); | 56 | | #else | 57 | 8.91M | if (!HWY_CAP_GE512 || Lanes(df) == 8) { | 58 | 8.91M | auto t0 = InterleaveLower(df, v0, v1); | 59 | 8.91M | auto t1 = InterleaveUpper(df, v0, v1); | 60 | 8.91M | Store(ConcatLowerLower(df, t1, t0), df, mem); | 61 | 8.91M | Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df)); | 62 | 8.91M | } else { | 63 | | #if HWY_CAP_GE512 | 64 | | auto t0 = InterleaveLower(df, v0, v1); | 65 | | auto t1 = InterleaveUpper(df, v0, v1); | 66 | | Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1), | 67 | | Quarter<1>(df, t0), Quarter<1>(df, t1)), | 68 | | df, mem); | 69 | | Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1), | 70 | | Quarter<3>(df, t0), Quarter<3>(df, t1)), | 71 | | df, mem + Lanes(df)); | 72 | | #endif | 73 | 0 | } | 74 | 8.91M | #endif | 75 | 8.91M | } |
Unexecuted instantiation: void jxl::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, float>(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*) |
76 | | |
77 | | // Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must |
78 | | // be aligned. |
79 | | template <class DF, class V, typename T> |
80 | 28.2M | void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) { |
81 | 28.2M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); |
82 | | #if HWY_TARGET == HWY_SCALAR |
83 | | Store(v0, df, mem); |
84 | | Store(v1, df, mem + 1); |
85 | | Store(v2, df, mem + 2); |
86 | | Store(v3, df, mem + 3); |
87 | | #elif !HWY_CAP_GE256 |
88 | | auto t0 = InterleaveLower(df, v0, v2); |
89 | | auto t1 = InterleaveLower(df, v1, v3); |
90 | | auto t2 = InterleaveUpper(df, v0, v2); |
91 | | auto t3 = InterleaveUpper(df, v1, v3); |
92 | | Store(InterleaveLower(df, t0, t1), df, mem); |
93 | | Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df)); |
94 | | Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df)); |
95 | | Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df)); |
96 | | #elif !HWY_CAP_GE512 |
97 | | auto t0 = InterleaveLower(df, v0, v2); |
98 | | auto t1 = InterleaveLower(df, v1, v3); |
99 | | auto t2 = InterleaveUpper(df, v0, v2); |
100 | | auto t3 = InterleaveUpper(df, v1, v3); |
101 | | |
102 | | auto m0 = InterleaveLower(df, t0, t1); |
103 | | auto m1 = InterleaveUpper(df, t0, t1); |
104 | | auto m2 = InterleaveLower(df, t2, t3); |
105 | | auto m3 = InterleaveUpper(df, t2, t3); |
106 | | |
107 | | Store(ConcatLowerLower(df, m1, m0), df, mem); |
108 | | Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); |
109 | | Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df)); |
110 | | Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df)); |
111 | | #else |
112 | | auto t0 = InterleaveLower(df, v0, v2); |
113 | | auto t1 = InterleaveLower(df, v1, v3); |
114 | | auto t2 = InterleaveUpper(df, v0, v2); |
115 | | auto t3 = InterleaveUpper(df, v1, v3); |
116 | | |
117 | | auto m0 = InterleaveLower(df, t0, t1); |
118 | | auto m1 = InterleaveUpper(df, t0, t1); |
119 | | auto m2 = InterleaveLower(df, t2, t3); |
120 | | auto m3 = InterleaveUpper(df, t2, t3); |
121 | | |
122 | | Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), |
123 | | Quarter<0>(df, m3)), |
124 | | df, mem); |
125 | | Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), |
126 | | Quarter<1>(df, m3)), |
127 | | df, mem + Lanes(df)); |
128 | | Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), |
129 | | Quarter<2>(df, m3)), |
130 | | df, mem + 2 * Lanes(df)); |
131 | | Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), |
132 | | Quarter<3>(df, m3)), |
133 | | df, mem + 3 * Lanes(df)); |
134 | | #endif |
135 | 28.2M | } Unexecuted instantiation: void jxl::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, float>(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*) void jxl::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, float>(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*) Line | Count | Source | 80 | 28.2M | void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) { | 81 | 28.2M | static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types"); | 82 | | #if HWY_TARGET == HWY_SCALAR | 83 | | Store(v0, df, mem); | 84 | | Store(v1, df, mem + 1); | 85 | | Store(v2, df, mem + 2); | 86 | | Store(v3, df, mem + 3); | 87 | | #elif !HWY_CAP_GE256 | 88 | | auto t0 = InterleaveLower(df, v0, v2); | 89 | | auto t1 = InterleaveLower(df, v1, v3); | 90 | | auto t2 = InterleaveUpper(df, v0, v2); | 91 | | auto t3 = InterleaveUpper(df, v1, v3); | 92 | | Store(InterleaveLower(df, t0, t1), df, mem); | 93 | | Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df)); | 94 | | Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df)); | 95 | | Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df)); | 96 | | #elif !HWY_CAP_GE512 | 97 | | auto t0 = InterleaveLower(df, v0, v2); | 98 | 28.2M | auto t1 = InterleaveLower(df, v1, v3); | 99 | 28.2M | auto t2 = InterleaveUpper(df, v0, v2); | 100 | 28.2M | auto t3 = InterleaveUpper(df, v1, v3); | 101 | | | 102 | 28.2M | auto m0 = InterleaveLower(df, t0, t1); | 103 | 28.2M | auto m1 = InterleaveUpper(df, t0, t1); | 104 | 28.2M | auto m2 = InterleaveLower(df, t2, t3); | 105 | 28.2M | auto m3 = InterleaveUpper(df, t2, t3); | 106 | | | 107 | 28.2M | Store(ConcatLowerLower(df, m1, m0), df, mem); | 108 | 28.2M | Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); | 109 | 28.2M | Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df)); | 110 | 28.2M | Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df)); | 111 | | #else | 112 | | auto t0 = InterleaveLower(df, v0, v2); | 113 | | auto t1 = InterleaveLower(df, v1, v3); | 114 | | auto t2 = InterleaveUpper(df, v0, v2); | 115 | | auto t3 = InterleaveUpper(df, v1, v3); | 116 | | | 117 | | auto m0 = InterleaveLower(df, t0, t1); | 118 | | auto m1 = InterleaveUpper(df, t0, t1); | 119 | | auto m2 = InterleaveLower(df, t2, t3); | 120 | | auto m3 = InterleaveUpper(df, t2, t3); | 121 | | | 122 | | Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), | 123 | | Quarter<0>(df, m3)), | 124 | | df, mem); | 125 | | Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), | 126 | | Quarter<1>(df, m3)), | 127 | | df, mem + Lanes(df)); | 128 | | Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), | 129 | | Quarter<2>(df, m3)), | 130 | | df, mem + 2 * Lanes(df)); | 131 | | Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), | 132 | | Quarter<3>(df, m3)), | 133 | | df, mem + 3 * Lanes(df)); | 134 | | #endif | 135 | 28.2M | } |
Unexecuted instantiation: void jxl::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, float>(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*) |
136 | | |
137 | | // Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to |
138 | | // mem, in this order. Mem must be aligned. |
139 | | template <class DF, class V> |
140 | | void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6, |
141 | 3.85M | V v7, float* mem) { |
142 | | #if HWY_TARGET == HWY_SCALAR |
143 | | Store(v0, df, mem); |
144 | | Store(v1, df, mem + 1); |
145 | | Store(v2, df, mem + 2); |
146 | | Store(v3, df, mem + 3); |
147 | | Store(v4, df, mem + 4); |
148 | | Store(v5, df, mem + 5); |
149 | | Store(v6, df, mem + 6); |
150 | | Store(v7, df, mem + 7); |
151 | | #elif !HWY_CAP_GE256 |
152 | | auto t0 = InterleaveLower(df, v0, v4); |
153 | | auto t1 = InterleaveLower(df, v1, v5); |
154 | | auto t2 = InterleaveLower(df, v2, v6); |
155 | | auto t3 = InterleaveLower(df, v3, v7); |
156 | | auto t4 = InterleaveUpper(df, v0, v4); |
157 | | auto t5 = InterleaveUpper(df, v1, v5); |
158 | | auto t6 = InterleaveUpper(df, v2, v6); |
159 | | auto t7 = InterleaveUpper(df, v3, v7); |
160 | | |
161 | | auto w0 = InterleaveLower(df, t0, t2); |
162 | | auto w1 = InterleaveLower(df, t1, t3); |
163 | | auto w2 = InterleaveUpper(df, t0, t2); |
164 | | auto w3 = InterleaveUpper(df, t1, t3); |
165 | | auto w4 = InterleaveLower(df, t4, t6); |
166 | | auto w5 = InterleaveLower(df, t5, t7); |
167 | | auto w6 = InterleaveUpper(df, t4, t6); |
168 | | auto w7 = InterleaveUpper(df, t5, t7); |
169 | | |
170 | | Store(InterleaveLower(df, w0, w1), df, mem); |
171 | | Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df)); |
172 | | Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df)); |
173 | | Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df)); |
174 | | Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df)); |
175 | | Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df)); |
176 | | Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df)); |
177 | | Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df)); |
178 | | #elif !HWY_CAP_GE512 |
179 | | auto t0 = InterleaveLower(df, v0, v4); |
180 | | auto t1 = InterleaveLower(df, v1, v5); |
181 | | auto t2 = InterleaveLower(df, v2, v6); |
182 | | auto t3 = InterleaveLower(df, v3, v7); |
183 | | auto t4 = InterleaveUpper(df, v0, v4); |
184 | | auto t5 = InterleaveUpper(df, v1, v5); |
185 | | auto t6 = InterleaveUpper(df, v2, v6); |
186 | | auto t7 = InterleaveUpper(df, v3, v7); |
187 | | |
188 | | auto w0 = InterleaveLower(df, t0, t2); |
189 | | auto w1 = InterleaveLower(df, t1, t3); |
190 | | auto w2 = InterleaveUpper(df, t0, t2); |
191 | | auto w3 = InterleaveUpper(df, t1, t3); |
192 | | auto w4 = InterleaveLower(df, t4, t6); |
193 | | auto w5 = InterleaveLower(df, t5, t7); |
194 | | auto w6 = InterleaveUpper(df, t4, t6); |
195 | | auto w7 = InterleaveUpper(df, t5, t7); |
196 | | |
197 | | auto m0 = InterleaveLower(df, w0, w1); |
198 | | auto m1 = InterleaveUpper(df, w0, w1); |
199 | | auto m2 = InterleaveLower(df, w2, w3); |
200 | | auto m3 = InterleaveUpper(df, w2, w3); |
201 | | auto m4 = InterleaveLower(df, w4, w5); |
202 | | auto m5 = InterleaveUpper(df, w4, w5); |
203 | | auto m6 = InterleaveLower(df, w6, w7); |
204 | | auto m7 = InterleaveUpper(df, w6, w7); |
205 | | |
206 | | Store(ConcatLowerLower(df, m1, m0), df, mem); |
207 | | Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); |
208 | | Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df)); |
209 | | Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df)); |
210 | | Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df)); |
211 | | Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df)); |
212 | | Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df)); |
213 | | Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df)); |
214 | | #else |
215 | | auto t0 = InterleaveLower(df, v0, v4); |
216 | | auto t1 = InterleaveLower(df, v1, v5); |
217 | | auto t2 = InterleaveLower(df, v2, v6); |
218 | | auto t3 = InterleaveLower(df, v3, v7); |
219 | | auto t4 = InterleaveUpper(df, v0, v4); |
220 | | auto t5 = InterleaveUpper(df, v1, v5); |
221 | | auto t6 = InterleaveUpper(df, v2, v6); |
222 | | auto t7 = InterleaveUpper(df, v3, v7); |
223 | | |
224 | | auto w0 = InterleaveLower(df, t0, t2); |
225 | | auto w1 = InterleaveLower(df, t1, t3); |
226 | | auto w2 = InterleaveUpper(df, t0, t2); |
227 | | auto w3 = InterleaveUpper(df, t1, t3); |
228 | | auto w4 = InterleaveLower(df, t4, t6); |
229 | | auto w5 = InterleaveLower(df, t5, t7); |
230 | | auto w6 = InterleaveUpper(df, t4, t6); |
231 | | auto w7 = InterleaveUpper(df, t5, t7); |
232 | | |
233 | | auto m0 = InterleaveLower(df, w0, w1); |
234 | | auto m1 = InterleaveUpper(df, w0, w1); |
235 | | auto m2 = InterleaveLower(df, w2, w3); |
236 | | auto m3 = InterleaveUpper(df, w2, w3); |
237 | | auto m4 = InterleaveLower(df, w4, w5); |
238 | | auto m5 = InterleaveUpper(df, w4, w5); |
239 | | auto m6 = InterleaveLower(df, w6, w7); |
240 | | auto m7 = InterleaveUpper(df, w6, w7); |
241 | | |
242 | | Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), |
243 | | Quarter<0>(df, m3)), |
244 | | df, mem); |
245 | | Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6), |
246 | | Quarter<0>(df, m7)), |
247 | | df, mem + Lanes(df)); |
248 | | Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), |
249 | | Quarter<1>(df, m3)), |
250 | | df, mem + 2 * Lanes(df)); |
251 | | Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6), |
252 | | Quarter<1>(df, m7)), |
253 | | df, mem + 3 * Lanes(df)); |
254 | | Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), |
255 | | Quarter<2>(df, m3)), |
256 | | df, mem + 4 * Lanes(df)); |
257 | | Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6), |
258 | | Quarter<2>(df, m7)), |
259 | | df, mem + 5 * Lanes(df)); |
260 | | Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), |
261 | | Quarter<3>(df, m3)), |
262 | | df, mem + 6 * Lanes(df)); |
263 | | Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6), |
264 | | Quarter<3>(df, m7)), |
265 | | df, mem + 7 * Lanes(df)); |
266 | | #endif |
267 | 3.85M | } Unexecuted instantiation: void jxl::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*) void jxl::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*) Line | Count | Source | 141 | 3.85M | V v7, float* mem) { | 142 | | #if HWY_TARGET == HWY_SCALAR | 143 | | Store(v0, df, mem); | 144 | | Store(v1, df, mem + 1); | 145 | | Store(v2, df, mem + 2); | 146 | | Store(v3, df, mem + 3); | 147 | | Store(v4, df, mem + 4); | 148 | | Store(v5, df, mem + 5); | 149 | | Store(v6, df, mem + 6); | 150 | | Store(v7, df, mem + 7); | 151 | | #elif !HWY_CAP_GE256 | 152 | | auto t0 = InterleaveLower(df, v0, v4); | 153 | | auto t1 = InterleaveLower(df, v1, v5); | 154 | | auto t2 = InterleaveLower(df, v2, v6); | 155 | | auto t3 = InterleaveLower(df, v3, v7); | 156 | | auto t4 = InterleaveUpper(df, v0, v4); | 157 | | auto t5 = InterleaveUpper(df, v1, v5); | 158 | | auto t6 = InterleaveUpper(df, v2, v6); | 159 | | auto t7 = InterleaveUpper(df, v3, v7); | 160 | | | 161 | | auto w0 = InterleaveLower(df, t0, t2); | 162 | | auto w1 = InterleaveLower(df, t1, t3); | 163 | | auto w2 = InterleaveUpper(df, t0, t2); | 164 | | auto w3 = InterleaveUpper(df, t1, t3); | 165 | | auto w4 = InterleaveLower(df, t4, t6); | 166 | | auto w5 = InterleaveLower(df, t5, t7); | 167 | | auto w6 = InterleaveUpper(df, t4, t6); | 168 | | auto w7 = InterleaveUpper(df, t5, t7); | 169 | | | 170 | | Store(InterleaveLower(df, w0, w1), df, mem); | 171 | | Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df)); | 172 | | Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df)); | 173 | | Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df)); | 174 | | Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df)); | 175 | | Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df)); | 176 | | Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df)); | 177 | | Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df)); | 178 | | #elif !HWY_CAP_GE512 | 179 | | auto t0 = InterleaveLower(df, v0, v4); | 180 | 3.85M | auto t1 = InterleaveLower(df, v1, v5); | 181 | 3.85M | auto t2 = InterleaveLower(df, v2, v6); | 182 | 3.85M | auto t3 = InterleaveLower(df, v3, v7); | 183 | 3.85M | auto t4 = InterleaveUpper(df, v0, v4); | 184 | 3.85M | auto t5 = InterleaveUpper(df, v1, v5); | 185 | 3.85M | auto t6 = InterleaveUpper(df, v2, v6); | 186 | 3.85M | auto t7 = InterleaveUpper(df, v3, v7); | 187 | | | 188 | 3.85M | auto w0 = InterleaveLower(df, t0, t2); | 189 | 3.85M | auto w1 = InterleaveLower(df, t1, t3); | 190 | 3.85M | auto w2 = InterleaveUpper(df, t0, t2); | 191 | 3.85M | auto w3 = InterleaveUpper(df, t1, t3); | 192 | 3.85M | auto w4 = InterleaveLower(df, t4, t6); | 193 | 3.85M | auto w5 = InterleaveLower(df, t5, t7); | 194 | 3.85M | auto w6 = InterleaveUpper(df, t4, t6); | 195 | 3.85M | auto w7 = InterleaveUpper(df, t5, t7); | 196 | | | 197 | 3.85M | auto m0 = InterleaveLower(df, w0, w1); | 198 | 3.85M | auto m1 = InterleaveUpper(df, w0, w1); | 199 | 3.85M | auto m2 = InterleaveLower(df, w2, w3); | 200 | 3.85M | auto m3 = InterleaveUpper(df, w2, w3); | 201 | 3.85M | auto m4 = InterleaveLower(df, w4, w5); | 202 | 3.85M | auto m5 = InterleaveUpper(df, w4, w5); | 203 | 3.85M | auto m6 = InterleaveLower(df, w6, w7); | 204 | 3.85M | auto m7 = InterleaveUpper(df, w6, w7); | 205 | | | 206 | 3.85M | Store(ConcatLowerLower(df, m1, m0), df, mem); | 207 | 3.85M | Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df)); | 208 | 3.85M | Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df)); | 209 | 3.85M | Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df)); | 210 | 3.85M | Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df)); | 211 | 3.85M | Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df)); | 212 | 3.85M | Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df)); | 213 | 3.85M | Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df)); | 214 | | #else | 215 | | auto t0 = InterleaveLower(df, v0, v4); | 216 | | auto t1 = InterleaveLower(df, v1, v5); | 217 | | auto t2 = InterleaveLower(df, v2, v6); | 218 | | auto t3 = InterleaveLower(df, v3, v7); | 219 | | auto t4 = InterleaveUpper(df, v0, v4); | 220 | | auto t5 = InterleaveUpper(df, v1, v5); | 221 | | auto t6 = InterleaveUpper(df, v2, v6); | 222 | | auto t7 = InterleaveUpper(df, v3, v7); | 223 | | | 224 | | auto w0 = InterleaveLower(df, t0, t2); | 225 | | auto w1 = InterleaveLower(df, t1, t3); | 226 | | auto w2 = InterleaveUpper(df, t0, t2); | 227 | | auto w3 = InterleaveUpper(df, t1, t3); | 228 | | auto w4 = InterleaveLower(df, t4, t6); | 229 | | auto w5 = InterleaveLower(df, t5, t7); | 230 | | auto w6 = InterleaveUpper(df, t4, t6); | 231 | | auto w7 = InterleaveUpper(df, t5, t7); | 232 | | | 233 | | auto m0 = InterleaveLower(df, w0, w1); | 234 | | auto m1 = InterleaveUpper(df, w0, w1); | 235 | | auto m2 = InterleaveLower(df, w2, w3); | 236 | | auto m3 = InterleaveUpper(df, w2, w3); | 237 | | auto m4 = InterleaveLower(df, w4, w5); | 238 | | auto m5 = InterleaveUpper(df, w4, w5); | 239 | | auto m6 = InterleaveLower(df, w6, w7); | 240 | | auto m7 = InterleaveUpper(df, w6, w7); | 241 | | | 242 | | Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2), | 243 | | Quarter<0>(df, m3)), | 244 | | df, mem); | 245 | | Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6), | 246 | | Quarter<0>(df, m7)), | 247 | | df, mem + Lanes(df)); | 248 | | Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2), | 249 | | Quarter<1>(df, m3)), | 250 | | df, mem + 2 * Lanes(df)); | 251 | | Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6), | 252 | | Quarter<1>(df, m7)), | 253 | | df, mem + 3 * Lanes(df)); | 254 | | Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2), | 255 | | Quarter<2>(df, m3)), | 256 | | df, mem + 4 * Lanes(df)); | 257 | | Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6), | 258 | | Quarter<2>(df, m7)), | 259 | | df, mem + 5 * Lanes(df)); | 260 | | Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2), | 261 | | Quarter<3>(df, m3)), | 262 | | df, mem + 6 * Lanes(df)); | 263 | | Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6), | 264 | | Quarter<3>(df, m7)), | 265 | | df, mem + 7 * Lanes(df)); | 266 | | #endif | 267 | 3.85M | } |
Unexecuted instantiation: void jxl::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*) |
268 | | |
269 | | #if HWY_CAP_GE256 |
270 | | JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, |
271 | 2.66M | int32_t* JXL_RESTRICT to, size_t fromstride) { |
272 | 2.66M | const HWY_CAPPED(int32_t, 8) d; |
273 | 2.66M | auto i0 = Load(d, from); |
274 | 2.66M | auto i1 = Load(d, from + 1 * fromstride); |
275 | 2.66M | auto i2 = Load(d, from + 2 * fromstride); |
276 | 2.66M | auto i3 = Load(d, from + 3 * fromstride); |
277 | 2.66M | auto i4 = Load(d, from + 4 * fromstride); |
278 | 2.66M | auto i5 = Load(d, from + 5 * fromstride); |
279 | 2.66M | auto i6 = Load(d, from + 6 * fromstride); |
280 | 2.66M | auto i7 = Load(d, from + 7 * fromstride); |
281 | | |
282 | 2.66M | const auto q0 = InterleaveLower(d, i0, i2); |
283 | 2.66M | const auto q1 = InterleaveLower(d, i1, i3); |
284 | 2.66M | const auto q2 = InterleaveUpper(d, i0, i2); |
285 | 2.66M | const auto q3 = InterleaveUpper(d, i1, i3); |
286 | 2.66M | const auto q4 = InterleaveLower(d, i4, i6); |
287 | 2.66M | const auto q5 = InterleaveLower(d, i5, i7); |
288 | 2.66M | const auto q6 = InterleaveUpper(d, i4, i6); |
289 | 2.66M | const auto q7 = InterleaveUpper(d, i5, i7); |
290 | | |
291 | 2.66M | const auto r0 = InterleaveLower(d, q0, q1); |
292 | 2.66M | const auto r1 = InterleaveUpper(d, q0, q1); |
293 | 2.66M | const auto r2 = InterleaveLower(d, q2, q3); |
294 | 2.66M | const auto r3 = InterleaveUpper(d, q2, q3); |
295 | 2.66M | const auto r4 = InterleaveLower(d, q4, q5); |
296 | 2.66M | const auto r5 = InterleaveUpper(d, q4, q5); |
297 | 2.66M | const auto r6 = InterleaveLower(d, q6, q7); |
298 | 2.66M | const auto r7 = InterleaveUpper(d, q6, q7); |
299 | | |
300 | 2.66M | i0 = ConcatLowerLower(d, r4, r0); |
301 | 2.66M | i1 = ConcatLowerLower(d, r5, r1); |
302 | 2.66M | i2 = ConcatLowerLower(d, r6, r2); |
303 | 2.66M | i3 = ConcatLowerLower(d, r7, r3); |
304 | 2.66M | i4 = ConcatUpperUpper(d, r4, r0); |
305 | 2.66M | i5 = ConcatUpperUpper(d, r5, r1); |
306 | 2.66M | i6 = ConcatUpperUpper(d, r6, r2); |
307 | 2.66M | i7 = ConcatUpperUpper(d, r7, r3); |
308 | | |
309 | 2.66M | Store(i0, d, to); |
310 | 2.66M | Store(i1, d, to + 1 * 8); |
311 | 2.66M | Store(i2, d, to + 2 * 8); |
312 | 2.66M | Store(i3, d, to + 3 * 8); |
313 | 2.66M | Store(i4, d, to + 4 * 8); |
314 | 2.66M | Store(i5, d, to + 5 * 8); |
315 | 2.66M | Store(i6, d, to + 6 * 8); |
316 | 2.66M | Store(i7, d, to + 7 * 8); |
317 | 2.66M | } |
318 | | #elif HWY_TARGET != HWY_SCALAR |
319 | | JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from, |
320 | 0 | int32_t* JXL_RESTRICT to, size_t fromstride) { |
321 | 0 | const HWY_CAPPED(int32_t, 4) d; |
322 | 0 | for (size_t n = 0; n < 8; n += 4) { |
323 | 0 | for (size_t m = 0; m < 8; m += 4) { |
324 | 0 | auto p0 = Load(d, from + n * fromstride + m); |
325 | 0 | auto p1 = Load(d, from + (n + 1) * fromstride + m); |
326 | 0 | auto p2 = Load(d, from + (n + 2) * fromstride + m); |
327 | 0 | auto p3 = Load(d, from + (n + 3) * fromstride + m); |
328 | 0 | const auto q0 = InterleaveLower(d, p0, p2); |
329 | 0 | const auto q1 = InterleaveLower(d, p1, p3); |
330 | 0 | const auto q2 = InterleaveUpper(d, p0, p2); |
331 | 0 | const auto q3 = InterleaveUpper(d, p1, p3); |
332 | |
|
333 | 0 | const auto r0 = InterleaveLower(d, q0, q1); |
334 | 0 | const auto r1 = InterleaveUpper(d, q0, q1); |
335 | 0 | const auto r2 = InterleaveLower(d, q2, q3); |
336 | 0 | const auto r3 = InterleaveUpper(d, q2, q3); |
337 | 0 | Store(r0, d, to + m * 8 + n); |
338 | 0 | Store(r1, d, to + (1 + m) * 8 + n); |
339 | 0 | Store(r2, d, to + (2 + m) * 8 + n); |
340 | 0 | Store(r3, d, to + (3 + m) * 8 + n); |
341 | 0 | } |
342 | 0 | } |
343 | 0 | } Unexecuted instantiation: jxl::N_SSE4::Transpose8x8Block(int const*, int*, unsigned long) Unexecuted instantiation: jxl::N_SSE2::Transpose8x8Block(int const*, int*, unsigned long) |
344 | | |
345 | | #endif |
346 | | |
347 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
348 | | } // namespace HWY_NAMESPACE |
349 | | } // namespace jxl |
350 | | HWY_AFTER_NAMESPACE(); |
351 | | |
352 | | #endif // LIB_JXL_SIMD_UTIL_INL_H_ |