/src/astc-encoder/Source/astcenc_vecmathlib_common_4.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2020-2025 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | /** |
19 | | * @brief Generic 4x32-bit vector functions. |
20 | | * |
21 | | * This module implements generic 4-wide vector functions that are valid for |
22 | | * all instruction sets, typically implemented using lower level 4-wide |
23 | | * operations that are ISA-specific. |
24 | | */ |
25 | | |
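The layering described in the comment above is easiest to see from the caller's side. A minimal usage sketch, assuming a translation unit that includes the umbrella header astcenc_vecmathlib.h with one of the ISA backends configured; the helper name normalize_channel is purely illustrative:

    #include "astcenc_vecmathlib.h"

    // Generic helpers from this module compose the ISA-specific 4-wide primitives.
    static vfloat4 normalize_channel(vfloat4 raw8bit)
    {
        // The scalar divide overload and clampzo() are defined later in this header.
        return clampzo(raw8bit / 255.0f);
    }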
26 | | #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
27 | | #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
28 | | |
29 | | #ifndef ASTCENC_SIMD_INLINE |
30 | | #error "Include astcenc_vecmathlib.h, do not include directly" |
31 | | #endif |
32 | | |
33 | | #include <cstdio> |
34 | | #include <limits> |
35 | | |
36 | | // ============================================================================ |
37 | | // vint4 operators and functions |
38 | | // ============================================================================ |
39 | | |
40 | | /** |
41 | | * @brief Overload: vector by scalar addition. |
42 | | */ |
43 | | ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b) |
44 | 0 | { |
45 | 0 | return a + vint4(b); |
46 | 0 | } |
47 | | |
48 | | /** |
49 | | * @brief Overload: vector by vector incremental addition. |
50 | | */ |
51 | | ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b) |
52 | 0 | { |
53 | 0 | a = a + b; |
54 | 0 | return a; |
55 | 0 | } |
56 | | |
57 | | /** |
58 | | * @brief Overload: vector by scalar subtraction. |
59 | | */ |
60 | | ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b) |
61 | 0 | { |
62 | 0 | return a - vint4(b); |
63 | 0 | } |
64 | | |
65 | | /** |
66 | | * @brief Overload: vector by scalar multiplication. |
67 | | */ |
68 | | ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b) |
69 | 0 | { |
70 | 0 | return a * vint4(b); |
71 | 0 | } |
72 | | |
73 | | /** |
74 | | * @brief Overload: vector by scalar bitwise or. |
75 | | */ |
76 | | ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b) |
77 | 0 | { |
78 | 0 | return a | vint4(b); |
79 | 0 | } |
80 | | |
81 | | /** |
82 | | * @brief Overload: vector by scalar bitwise and. |
83 | | */ |
84 | | ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b) |
85 | 0 | { |
86 | 0 | return a & vint4(b); |
87 | 0 | } |
88 | | |
89 | | /** |
90 | | * @brief Overload: vector by scalar bitwise xor. |
91 | | */ |
92 | | ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b) |
93 | 0 | { |
94 | 0 | return a ^ vint4(b); |
95 | 0 | } |
96 | | |
97 | | /** |
98 | | * @brief Return the clamped value between min and max. |
99 | | */ |
100 | | ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a) |
101 | 0 | { |
102 | 0 | return min(max(a, vint4(minv)), vint4(maxv)); |
103 | 0 | } |
104 | | |
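A small, hypothetical usage sketch for the integer clamp above; the values and variable names are illustrative only, and the scalar-replicate vint4 constructor used here is the one already used by the overloads in this section:

    vint4 texel(300);                       // every lane holds 300
    vint4 hi = clamp(0, 255, texel);        // every lane becomes 255
    vint4 lo = clamp(0, 255, vint4(-7));    // every lane becomes 0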
105 | | /** |
106 | | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
107 | | */ |
108 | | ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) |
109 | 0 | { |
110 | 0 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
111 | 0 | } |
112 | | |
113 | | /** |
114 | | * @brief Return the horizontal minimum of a vector. |
115 | | */ |
116 | | ASTCENC_SIMD_INLINE int hmin_s(vint4 a) |
117 | 0 | { |
118 | 0 | return hmin(a).lane<0>(); |
119 | 0 | } |
120 | | |
121 | | /** |
122 | | * @brief Generate a vint4 from a size_t. |
123 | | */ |
124 | | ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a) |
125 | 0 | { |
126 | 0 | assert(a <= std::numeric_limits<int>::max()); |
127 | 0 | return vint4(static_cast<int>(a)); |
128 | 0 | } |
129 | | |
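A hypothetical use of the helper above is broadcasting a buffer length for lane-wise bounds tests; the assert guards the narrowing conversion:

    size_t texel_count = 64;                      // illustrative block size
    vint4 limit = vint4_from_size(texel_count);   // all four lanes hold 64
    // Passing a value above INT_MAX would trip the assert in debug builds.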
130 | | /** |
131 | | * @brief Return the horizontal maximum of a vector. |
132 | | */ |
133 | | ASTCENC_SIMD_INLINE int hmax_s(vint4 a) |
134 | 0 | { |
135 | 0 | return hmax(a).lane<0>(); |
136 | 0 | } |
137 | | |
138 | | // ============================================================================ |
139 | | // vfloat4 operators and functions |
140 | | // ============================================================================ |
141 | | |
142 | | /** |
143 | | * @brief Overload: vector by vector incremental addition. |
144 | | */ |
145 | | ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b) |
146 | 0 | { |
147 | 0 | a = a + b; |
148 | 0 | return a; |
149 | 0 | } |
150 | | |
151 | | /** |
152 | | * @brief Overload: vector by scalar addition. |
153 | | */ |
154 | | ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b) |
155 | 0 | { |
156 | 0 | return a + vfloat4(b); |
157 | 0 | } |
158 | | |
159 | | /** |
160 | | * @brief Overload: vector by scalar subtraction. |
161 | | */ |
162 | | ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b) |
163 | 0 | { |
164 | 0 | return a - vfloat4(b); |
165 | 0 | } |
166 | | |
167 | | /** |
168 | | * @brief Overload: vector by scalar multiplication. |
169 | | */ |
170 | | ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) |
171 | 0 | { |
172 | 0 | return a * vfloat4(b); |
173 | 0 | } |
174 | | |
175 | | /** |
176 | | * @brief Overload: scalar by vector multiplication. |
177 | | */ |
178 | | ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) |
179 | 0 | { |
180 | 0 | return vfloat4(a) * b; |
181 | 0 | } |
182 | | |
183 | | /** |
184 | | * @brief Overload: vector by scalar division. |
185 | | */ |
186 | | ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b) |
187 | 0 | { |
188 | 0 | return a / vfloat4(b); |
189 | 0 | } |
190 | | |
191 | | /** |
192 | | * @brief Overload: scalar by vector division. |
193 | | */ |
194 | | ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b) |
195 | 0 | { |
196 | 0 | return vfloat4(a) / b; |
197 | 0 | } |
198 | | |
199 | | /** |
200 | | * @brief Return the min vector of a vector and a scalar. |
201 | | * |
202 | | * If either lane value is NaN, @c b will be returned for that lane. |
203 | | */ |
204 | | ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b) |
205 | 0 | { |
206 | 0 | return min(a, vfloat4(b)); |
207 | 0 | } |
208 | | |
209 | | /** |
210 | | * @brief Return the max vector of a vector and a scalar. |
211 | | * |
212 | | * If either lane value is NaN, @c b will be returned for that lane. |
213 | | */ |
214 | | ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b) |
215 | 0 | { |
216 | 0 | return max(a, vfloat4(b)); |
217 | 0 | } |
218 | | |
219 | | /** |
220 | | * @brief Return the clamped value between min and max. |
221 | | * |
 222 | | * It is assumed that neither @c minv nor @c maxv is a NaN value. If @c a is NaN |
 223 | | * then @c minv will be returned for that lane. |
224 | | */ |
225 | | ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) |
226 | 0 | { |
227 | 0 | // Do not reorder - second operand will return if either is NaN |
228 | 0 | return min(max(a, minv), maxv); |
229 | 0 | } |
230 | | |
231 | | /** |
232 | | * @brief Return the clamped value between 0.0f and 1.0f. |
233 | | * |
234 | | * If @c a is NaN then zero will be returned for that lane. |
235 | | */ |
236 | | ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) |
237 | 0 | { |
238 | 0 | // Do not reorder - second operand will return if either is NaN |
239 | 0 | return min(max(a, vfloat4::zero()), 1.0f); |
240 | 0 | } |
241 | | |
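The "do not reorder" comments matter because, per the min() and max() overloads above, a NaN lane yields the second operand. A worked sketch of clampzo() under that rule; lane values are illustrative, and <limits> is already included by this header:

    vfloat4 v(0.5f, -2.0f, 3.0f, std::numeric_limits<float>::quiet_NaN());
    // max(v, zero): the NaN lane returns the second operand, 0.0f; others clamp from below
    // min(.., 1.0f): then clamps from above
    vfloat4 r = clampzo(v);   // lanes: 0.5f, 0.0f, 1.0f, 0.0f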
242 | | /** |
243 | | * @brief Return the horizontal minimum of a vector. |
244 | | */ |
245 | | ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a) |
246 | 0 | { |
247 | 0 | return hmin(a).lane<0>(); |
248 | 0 | } |
249 | | |
250 | | /** |
251 | | * @brief Return the horizontal min of RGB vector lanes as a scalar. |
252 | | */ |
253 | | ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a) |
254 | 0 | { |
255 | 0 | a.set_lane<3>(a.lane<0>()); |
256 | 0 | return hmin_s(a); |
257 | 0 | } |
258 | | |
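The set_lane<3>() overwrite above is what excludes alpha: the alpha slot is replaced by a copy of the red lane, so it can never win the horizontal minimum on its own. An illustrative sketch:

    vfloat4 rgba(0.6f, 0.3f, 0.9f, 0.0f);
    float m3 = hmin_rgb_s(rgba);   // 0.3f; the 0.0f alpha lane is ignored
    float m4 = hmin_s(rgba);       // 0.0f; the full-width minimum does see alpha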
259 | | /** |
260 | | * @brief Return the horizontal maximum of a vector. |
261 | | */ |
262 | | ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) |
263 | 0 | { |
264 | 0 | return hmax(a).lane<0>(); |
265 | 0 | } |
266 | | |
267 | | /** |
268 | | * @brief Accumulate lane-wise sums for a vector. |
269 | | */ |
270 | | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) |
271 | 0 | { |
272 | 0 | accum = accum + a; |
273 | 0 | } |
274 | | |
275 | | /** |
276 | | * @brief Accumulate lane-wise sums for a masked vector. |
277 | | */ |
278 | | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) |
279 | 0 | { |
280 | 0 | a = select(vfloat4::zero(), a, m); |
281 | 0 | haccumulate(accum, a); |
282 | 0 | } |
283 | | |
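A hypothetical use of the masked accumulator above is summing only the lanes that pass a predicate; the lane-wise comparison operator returning a vmask4 is assumed to come from the ISA-specific headers:

    vfloat4 sum = vfloat4::zero();
    vfloat4 err(0.1f, 5.0f, 0.2f, 7.0f);
    vmask4 keep = err < vfloat4(1.0f);   // lanes 0 and 2 pass
    haccumulate(sum, err, keep);         // sum lanes: 0.1f, 0.0f, 0.2f, 0.0f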
284 | | /** |
285 | | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
286 | | */ |
287 | | ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) |
288 | 0 | { |
289 | 0 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
290 | 0 | } |
291 | | |
292 | | #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) |
293 | | |
294 | | /** |
295 | | * @brief Return the dot product for the full 4 lanes, returning scalar. |
296 | | */ |
297 | | ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) |
298 | 0 | { |
299 | 0 | vfloat4 m = a * b; |
300 | 0 | return hadd_s(m); |
301 | 0 | } |
302 | | |
303 | | /** |
304 | | * @brief Return the dot product for the full 4 lanes, returning vector. |
305 | | */ |
306 | | ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) |
307 | 0 | { |
308 | 0 | vfloat4 m = a * b; |
309 | 0 | return vfloat4(hadd_s(m)); |
310 | 0 | } |
311 | | |
312 | | /** |
313 | | * @brief Return the dot product for the bottom 3 lanes, returning scalar. |
314 | | */ |
315 | | ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) |
316 | 0 | { |
317 | 0 | vfloat4 m = a * b; |
318 | 0 | return hadd_rgb_s(m); |
319 | 0 | } |
320 | | |
321 | | /** |
322 | | * @brief Return the dot product for the bottom 3 lanes, returning vector. |
323 | | */ |
324 | | ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) |
325 | 0 | { |
326 | 0 | vfloat4 m = a * b; |
327 | 0 | float d3 = hadd_rgb_s(m); |
328 | 0 | return vfloat4(d3, d3, d3, 0.0f); |
329 | 0 | } |
330 | | |
331 | | #endif |
332 | | |
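These fallbacks are a lane-wise multiply followed by a horizontal add; when ASTCENC_USE_NATIVE_DOT_PRODUCT is defined the ISA-specific header provides them instead. An illustrative sketch of the scalar-returning forms, with values chosen for easy arithmetic:

    vfloat4 c(0.25f, 0.5f, 0.75f, 1.0f);
    float len4_sq = dot_s(c, c);    // 0.0625 + 0.25 + 0.5625 + 1.0 = 1.875
    float len3_sq = dot3_s(c, c);   // 0.0625 + 0.25 + 0.5625       = 0.875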
333 | | #if !defined(ASTCENC_USE_NATIVE_POPCOUNT) |
334 | | |
335 | | /** |
336 | | * @brief Population bit count. |
337 | | * |
 338 | | * @param v The value to count set bits in. |
339 | | * |
340 | | * @return The number of 1 bits. |
341 | | */ |
342 | | static inline int popcount(uint64_t v) |
343 | 0 | { |
344 | 0 | uint64_t mask1 = 0x5555555555555555ULL; |
345 | 0 | uint64_t mask2 = 0x3333333333333333ULL; |
346 | 0 | uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; |
347 | 0 | v -= (v >> 1) & mask1; |
348 | 0 | v = (v & mask2) + ((v >> 2) & mask2); |
349 | 0 | v += v >> 4; |
350 | 0 | v &= mask3; |
351 | 0 | v *= 0x0101010101010101ULL; |
352 | 0 | v >>= 56; |
353 | 0 | return static_cast<int>(v); |
 354 | 0 | }
     |   | Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_block_sizes.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_integer_sequence.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_mathlib.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_partition_tables.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_percentile_tables.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_symbolic_physical.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:popcount(unsigned long)
     |   | Unexecuted instantiation: astcenc_quantization.cpp:popcount(unsigned long)
355 | | |
356 | | #endif |
357 | | |
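This fallback is the classic SWAR bit count, used when ASTCENC_USE_NATIVE_POPCOUNT is not defined: adjacent bit pairs are summed in place, then nibbles, then bytes, and the multiply by 0x0101010101010101 gathers all byte sums into the top byte. A worked sketch with an arbitrary value:

    uint64_t v = 0xF0F0ULL;       // eight set bits
    int bits = popcount(v);       // == 8
    // Intermediate view for one byte, 0xF0 = 0b11110000:
    //   after the pair step   : 0b10100000   (each pair holds its own bit count)
    //   after the nibble step : 0b01000000   (4 set bits in the high nibble)
    //   after the byte step   : 0x04         (4 set bits in this byte)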
358 | | /** |
359 | | * @brief Apply signed bit transfer. |
360 | | * |
361 | | * @param input0 The first encoded endpoint. |
362 | | * @param input1 The second encoded endpoint. |
363 | | */ |
364 | | static ASTCENC_SIMD_INLINE void bit_transfer_signed( |
365 | | vint4& input0, |
366 | | vint4& input1 |
367 | 0 | ) { |
368 | 0 | input1 = lsr<1>(input1) | (input0 & 0x80); |
369 | 0 | input0 = lsr<1>(input0) & 0x3F; |
 370 | 0 | |
371 | 0 | vmask4 mask = (input0 & 0x20) != vint4::zero(); |
372 | 0 | input0 = select(input0, input0 - 0x40, mask); |
 373 | 0 | }
     |   | Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_block_sizes.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_integer_sequence.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_mathlib.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_partition_tables.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_percentile_tables.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_symbolic_physical.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:bit_transfer_signed(vint4&, vint4&)
     |   | Unexecuted instantiation: astcenc_quantization.cpp:bit_transfer_signed(vint4&, vint4&)
374 | | |
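In this transfer both inputs are shifted right by one bit, the top bit of each input0 lane becomes the new top bit of the matching input1 lane, and the surviving 6-bit field of input0 is sign-extended into the range [-32, 31]. A worked sketch on illustrative lane values (the ep0/ep1 names are hypothetical):

    vint4 ep0(0xE6);                 // scalar-replicate; all four lanes identical
    vint4 ep1(0x54);
    bit_transfer_signed(ep0, ep1);
    // ep1 lanes: (0x54 >> 1) | (0xE6 & 0x80) = 0x2A | 0x80 = 0xAA
    // ep0 lanes: (0xE6 >> 1) & 0x3F = 0x33 = 51; bit 5 is set, so 51 - 64 = -13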
375 | | /** |
376 | | * @brief Debug function to print a vector of ints. |
377 | | */ |
378 | | ASTCENC_SIMD_INLINE void print(vint4 a) |
379 | 0 | { |
380 | 0 | ASTCENC_ALIGNAS int v[4]; |
381 | 0 | storea(a, v); |
382 | 0 | printf("v4_i32:\n %8d %8d %8d %8d\n", |
383 | 0 | v[0], v[1], v[2], v[3]); |
384 | 0 | } |
385 | | |
386 | | /** |
387 | | * @brief Debug function to print a vector of ints. |
388 | | */ |
389 | | ASTCENC_SIMD_INLINE void printx(vint4 a) |
390 | 0 | { |
391 | 0 | ASTCENC_ALIGNAS int v[4]; |
392 | 0 | storea(a, v); |
 393 | 0 | |
394 | 0 | unsigned int uv[4]; |
395 | 0 | std::memcpy(uv, v, sizeof(int) * 4); |
 396 | 0 | |
397 | 0 | printf("v4_i32:\n %08x %08x %08x %08x\n", |
398 | 0 | uv[0], uv[1], uv[2], uv[3]); |
399 | 0 | } |
400 | | |
401 | | /** |
402 | | * @brief Debug function to print a vector of floats. |
403 | | */ |
404 | | ASTCENC_SIMD_INLINE void print(vfloat4 a) |
405 | 0 | { |
406 | 0 | ASTCENC_ALIGNAS float v[4]; |
407 | 0 | storea(a, v); |
408 | 0 | printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", |
409 | 0 | static_cast<double>(v[0]), static_cast<double>(v[1]), |
410 | 0 | static_cast<double>(v[2]), static_cast<double>(v[3])); |
411 | 0 | } |
412 | | |
413 | | /** |
414 | | * @brief Debug function to print a vector of masks. |
415 | | */ |
416 | | ASTCENC_SIMD_INLINE void print(vmask4 a) |
417 | 0 | { |
418 | 0 | print(select(vint4(0), vint4(1), a)); |
419 | 0 | } |
420 | | |
421 | | #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |