/src/astc-encoder/Source/astcenc_vecmathlib_none_4.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using plain C++.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors, and
 * provides a scalar fallback for VLA code, primarily useful for debugging VLA
 * algorithms without the complexity of handling SIMD. Only the baseline level
 * of functionality needed to support VLA is provided.
 *
 * Note that the vector conditional operators implemented by this module are
 * designed to behave like SIMD conditional operators that generate lane
 * masks. Rather than returning 0/1 booleans like normal C++ code they will
 * return 0/-1 to give a full lane-width bitmask.
 *
 * Note that the documentation for this module still talks about "vectors" to
 * help developers think about the implied VLA behavior when writing optimized
 * paths.
 */
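
// Illustrative sketch of the 0/-1 lane mask semantics described above; this
// is an editor-added example, not part of the library. A vector compare
// fills each passing lane with an all-ones bitmask rather than a C++ bool:
//
//     vint4 a(1, 2, 3, 4);
//     vint4 b(4, 3, 2, 1);
//     vmask4 gt = a > b;             // lanes hold 0, 0, -1, -1
//     unsigned int bits = mask(gt);  // 0b1100 == 0xC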

#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cassert>
#include <cfenv>
#include <cmath>
#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with wider VLA vectors if data is
	 * aligned to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(0.0f);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(p);
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector lane values.
	 */
	float m[4];
};

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using vint4::loada() which is better with wider VLA vectors
	 * if data is aligned.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		return vint4(0, 1, 2, 3);
	}

	/**
	 * @brief The vector lane values.
	 */
	int m[4];
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing mask value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = a == false ? 0 : -1;
		m[2] = a == false ? 0 : -1;
		m[3] = a == false ? 0 : -1;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = b == false ? 0 : -1;
		m[2] = c == false ? 0 : -1;
		m[3] = d == false ? 0 : -1;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return m[l] != 0;
	}

	/**
	 * @brief The vector lane values.
	 */
	int m[4];
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] | b.m[0],
	              a.m[1] | b.m[1],
	              a.m[2] | b.m[2],
	              a.m[3] | b.m[3]);
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] & b.m[0],
	              a.m[1] & b.m[1],
	              a.m[2] & b.m[2],
	              a.m[3] & b.m[3]);
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] ^ b.m[0],
	              a.m[1] ^ b.m[1],
	              a.m[2] ^ b.m[2],
	              a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(~a.m[0],
	              ~a.m[1],
	              ~a.m[2],
	              ~a.m[3]);
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
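	// Lanes hold 0 (all bits zero) or -1 (all bits one), so ANDing lane N
	// with the single bit (1 << N) extracts that lane's mask bit directly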
	return (a.m[0] & 0x1) |
	       (a.m[1] & 0x2) |
	       (a.m[2] & 0x4) |
	       (a.m[3] & 0x8);
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
	return mask(a) == 0xF;
}

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(a.m[0] + b.m[0],
	             a.m[1] + b.m[1],
	             a.m[2] + b.m[2],
	             a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(a.m[0] - b.m[0],
	             a.m[1] - b.m[1],
	             a.m[2] - b.m[2],
	             a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(a.m[0] * b.m[0],
	             a.m[1] * b.m[1],
	             a.m[2] * b.m[2],
	             a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(~a.m[0],
	             ~a.m[1],
	             ~a.m[2],
	             ~a.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(a.m[0] | b.m[0],
	             a.m[1] | b.m[1],
	             a.m[2] | b.m[2],
	             a.m[3] | b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(a.m[0] & b.m[0],
	             a.m[1] & b.m[1],
	             a.m[2] & b.m[2],
	             a.m[3] & b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(a.m[0] ^ b.m[0],
	             a.m[1] ^ b.m[1],
	             a.m[2] ^ b.m[2],
	             a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
	unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
	unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
	unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
	unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;

	return vint4(static_cast<int>(as0),
	             static_cast<int>(as1),
	             static_cast<int>(as2),
	             static_cast<int>(as3));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
	unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;

	return vint4(static_cast<int>(as0),
	             static_cast<int>(as1),
	             static_cast<int>(as2),
	             static_cast<int>(as3));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(a.m[0] >> s,
	             a.m[1] >> s,
	             a.m[2] >> s,
	             a.m[3] >> s);
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the horizontal minimum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	int b = std::min(a.m[0], a.m[1]);
	int c = std::min(a.m[2], a.m[3]);
	return vint4(std::min(b, c));
}

/**
 * @brief Return the horizontal maximum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	int b = std::max(a.m[0], a.m[1]);
	int c = std::max(a.m[2], a.m[3]);
	return vint4(std::max(b, c));
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
 * @brief Pack and store the low 8 bits of N (vector width) lanes.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
{
	int b0 = a.m[0] & 0xFF;
	int b1 = a.m[1] & 0xFF;
	int b2 = a.m[2] & 0xFF;
	int b3 = a.m[3] & 0xFF;

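	// Assemble the word so the packed bytes land in memory in lane order
	// (lane 0 at the lowest address) on both little- and big-endian targets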
#if !defined(ASTCENC_BIG_ENDIAN)
	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
#else
	int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
#endif
	a = vint4(b, 0, 0, 0);
	store_nbytes(a, p);
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	             (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	             (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	             (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
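
// Note that select() tests only the MSB of each mask lane, mirroring the
// sign-bit driven behavior of SIMD blend instructions (e.g. SSE blendv).
// Masks built by this module always hold 0 or -1 per lane, so the MSB test
// is equivalent to a full-lane test.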

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] + b.m[0],
	               a.m[1] + b.m[1],
	               a.m[2] + b.m[2],
	               a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] - b.m[0],
	               a.m[1] - b.m[1],
	               a.m[2] - b.m[2],
	               a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] * b.m[0],
	               a.m[1] * b.m[1],
	               a.m[2] * b.m[2],
	               a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] / b.m[0],
	               a.m[1] / b.m[1],
	               a.m[2] / b.m[2],
	               a.m[3] / b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] <= b.m[0],
	              a.m[1] <= b.m[1],
	              a.m[2] <= b.m[2],
	              a.m[3] <= b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] >= b.m[0],
	              a.m[1] >= b.m[1],
	              a.m[2] >= b.m[2],
	              a.m[3] >= b.m[3]);
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	return vfloat4(std::abs(a.m[0]),
	               std::abs(a.m[1]),
	               std::abs(a.m[2]),
	               std::abs(a.m[3]));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	assert(std::fegetround() == FE_TONEAREST);
	return vfloat4(std::nearbyint(a.m[0]),
	               std::nearbyint(a.m[1]),
	               std::nearbyint(a.m[2]),
	               std::nearbyint(a.m[3]));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	float tmp1 = std::min(a.m[0], a.m[1]);
	float tmp2 = std::min(a.m[2], a.m[3]);
	return vfloat4(std::min(tmp1, tmp2));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	float tmp1 = std::max(a.m[0], a.m[1]);
	float tmp2 = std::max(a.m[2], a.m[3]);
	return vfloat4(std::max(tmp1, tmp2));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Use halving add, gives invariance with SIMD versions
	return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(std::sqrt(a.m[0]),
	               std::sqrt(a.m[1]),
	               std::sqrt(a.m[2]),
	               std::sqrt(a.m[3]));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	return vfloat4(base[indices.m[0]],
	               base[indices.m[1]],
	               base[indices.m[2]],
	               base[indices.m[3]]);
}

/**
 * @brief Load a vector of gathered results from an array using byte indices
 * from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
	return vfloat4(base[indices[0]],
	               base[indices[1]],
	               base[indices[2]],
	               base[indices[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
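	// Bias by 0.5 then truncate; for non-negative inputs this rounds halfway
	// cases up rather than using round-to-nearest-even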
	a = a + vfloat4(0.5f);
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(static_cast<float>(a.m[0]),
	               static_cast<float>(a.m[1]),
	               static_cast<float>(a.m[2]),
	               static_cast<float>(a.m[3]));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	return vint4(
		float_to_sf16(a.lane<0>()),
		float_to_sf16(a.lane<1>()),
		float_to_sf16(a.lane<2>()),
		float_to_sf16(a.lane<3>()));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	return float_to_sf16(a);
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	return vfloat4(
		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	return sf16_to_float(a);
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	vint4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
	vfloat4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}
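
// Illustrative sketch of the bit-pattern flip described above (editor-added
// example, not part of the library): a branchless absolute value that clears
// the IEEE 754 sign bit (bit 31) in every lane:
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 av = int_as_float(bits);  // 1.0, 2.0, 3.0, 4.0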

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable4_16x8 {
	const uint8_t* data;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable4_32x8 {
	const uint8_t* data;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable4_64x8 {
	const uint8_t* data;
};

/**
 * @brief Prepare a vtable lookup table for a 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_16x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Prepare a vtable lookup table for a 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_32x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Prepare a vtable lookup table for a 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_64x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_16x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_32x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_64x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}
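
// Illustrative usage sketch for the vtable helpers above (editor-added
// example, not part of the library), assuming a caller-provided table:
//
//     static const uint8_t lut[16] { /* 16 entries */ };
//     vtable4_16x8 table;
//     vtable_prepare(table, lut);
//     vint4 v = vtable_lookup_32bit(table, vint4(0, 5, 10, 15));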

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
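	// Pack so that the stored bytes read R, G, B, A in ascending address
	// order on both little- and big-endian targets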
#if !defined(ASTCENC_BIG_ENDIAN)
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
#else
	return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
#endif
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
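	// Enabled lanes are contiguous from lane 0, so testing the highest mask
	// lane first determines how many leading lanes need to be stored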
	if (mask.m[3])
	{
		store(data, base);
	}
	else if (mask.m[2])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.m[1])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.m[0])
	{
		store_lane(base + 0, data.lane<0>());
	}
}

#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED