/src/astc-encoder/Source/astcenc_vecmathlib.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2019-2026 Arm Limited |
4 | | // Copyright 2008 Jose Fonseca |
5 | | // Copyright 2026 Olaf Bernstein |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
8 | | // use this file except in compliance with the License. You may obtain a copy |
9 | | // of the License at: |
10 | | // |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // |
13 | | // Unless required by applicable law or agreed to in writing, software |
14 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
15 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
16 | | // License for the specific language governing permissions and limitations |
17 | | // under the License. |
18 | | // ---------------------------------------------------------------------------- |
19 | | |
20 | | /* |
21 | | * This module implements vector support for floats, ints, and vector lane |
22 | | * control masks. It provides access to both explicit vector width types, and |
23 | | * flexible N-wide types where N can be determined at compile time. |
24 | | * |
25 | | * The design of this module encourages use of vector length agnostic code, via |
26 | | * the vint, vfloat, and vmask types. These will take on the widest SIMD vector |
27 | | * width that is available at compile time. The current vector width is |
28 | | * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant. |
29 | | * |
30 | | * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types. |
31 | | * These are provided primarily for prototyping and algorithm debug of VLA |
32 | | * implementations. |
33 | | * |
34 | | * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4 |
35 | | * types. These are provided for use by VLA code, but are also expected to be |
36 | | * used as a fixed-width type and will be supported by a reference C++ fallback for |
37 | | * use on platforms without SIMD intrinsics. |
38 | | * |
39 | | * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8 |
40 | | * types. These are provided for use by VLA code, and are not expected to be |
41 | | * used as a fixed-width type in normal code. No reference C implementation is |
42 | | * provided on platforms without underlying SIMD intrinsics. |
43 | | * |
44 | | * With the current implementation ISA support is provided for: |
45 | | * |
46 | | * * 1-wide for scalar reference |
47 | | * * 4-wide for Armv8-A NEON |
48 | | * * 4-wide for x86-64 SSE2 |
49 | | * * 4-wide for x86-64 SSE4.1 |
50 | | * * 8-wide for Armv8-A SVE |
51 | | * * 8-wide for x86-64 AVX2 |
52 | | */ |
53 | | |
54 | | #ifndef ASTC_VECMATHLIB_H_INCLUDED |
55 | | #define ASTC_VECMATHLIB_H_INCLUDED |
56 | | |
57 | | #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 |
58 | | #include <immintrin.h> |
59 | | #endif |
60 | | |
61 | | #if ASTCENC_SVE != 0 |
62 | | #include <arm_sve.h> |
63 | | #include <arm_neon_sve_bridge.h> |
64 | | #endif |
65 | | |
66 | | #if ASTCENC_NEON != 0 |
67 | | #include <arm_neon.h> |
68 | | #endif |
69 | | |
70 | | #if !defined(__clang__) && defined(_MSC_VER) |
71 | | #define ASTCENC_SIMD_INLINE __forceinline |
72 | | #define ASTCENC_NO_INLINE |
73 | | #elif defined(__GNUC__) && !defined(__clang__) |
74 | | #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline |
75 | | #define ASTCENC_NO_INLINE __attribute__ ((noinline)) |
76 | | #else |
77 | | #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline |
78 | | #define ASTCENC_NO_INLINE __attribute__ ((noinline)) |
79 | | #endif |
80 | | |
81 | | template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices); |
82 | | |
83 | | #if ASTCENC_AVX >= 2 |
84 | | // If we have AVX2 expose 8-wide VLA. |
85 | | #include "astcenc_vecmathlib_sse_4.h" |
86 | | #include "astcenc_vecmathlib_common_4.h" |
87 | | #include "astcenc_vecmathlib_avx2_8.h" |
88 | | |
89 | | #define ASTCENC_SIMD_WIDTH 8 |
90 | | |
91 | | using vfloat = vfloat8; |
92 | | |
93 | | #if defined(ASTCENC_NO_INVARIANCE) |
94 | | using vfloatacc = vfloat8; |
95 | | #else |
96 | | using vfloatacc = vfloat4; |
97 | | #endif |
98 | | |
99 | | using vint = vint8; |
100 | | using vmask = vmask8; |
101 | | |
102 | | using vtable_16x8 = vtable8_16x8; |
103 | | using vtable_32x8 = vtable8_32x8; |
104 | | using vtable_64x8 = vtable8_64x8; |
105 | | |
106 | | constexpr auto loada = vfloat8::loada; |
107 | | constexpr auto load1 = vfloat8::load1; |
108 | | constexpr auto vint_from_size = vint8_from_size; |
109 | | |
110 | | #elif ASTCENC_SSE >= 20 |
111 | | // If we have SSE expose 4-wide VLA, and 4-wide fixed width. |
112 | | #include "astcenc_vecmathlib_sse_4.h" |
113 | | #include "astcenc_vecmathlib_common_4.h" |
114 | | |
115 | | #define ASTCENC_SIMD_WIDTH 4 |
116 | | |
117 | | using vfloat = vfloat4; |
118 | | using vfloatacc = vfloat4; |
119 | | using vint = vint4; |
120 | | using vmask = vmask4; |
121 | | |
122 | | using vtable_16x8 = vtable4_16x8; |
123 | | using vtable_32x8 = vtable4_32x8; |
124 | | using vtable_64x8 = vtable4_64x8; |
125 | | |
126 | | constexpr auto loada = vfloat4::loada; |
127 | | constexpr auto load1 = vfloat4::load1; |
128 | | constexpr auto vint_from_size = vint4_from_size; |
129 | | |
130 | | #elif ASTCENC_SVE == 8 |
131 | | // Check the compiler is configured with fixed-length 256-bit SVE. |
132 | | #if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256) |
133 | | #error "__ARM_FEATURE_SVE_BITS is not set to 256 bits" |
134 | | #endif |
135 | | |
136 | | // If we have SVE configured as 8-wide, expose 8-wide VLA. |
137 | | #include "astcenc_vecmathlib_neon_4.h" |
138 | | #include "astcenc_vecmathlib_common_4.h" |
139 | | #include "astcenc_vecmathlib_sve_8.h" |
140 | | |
141 | | #define ASTCENC_SIMD_WIDTH 8 |
142 | | |
143 | | using vfloat = vfloat8; |
144 | | |
145 | | #if defined(ASTCENC_NO_INVARIANCE) |
146 | | using vfloatacc = vfloat8; |
147 | | #else |
148 | | using vfloatacc = vfloat4; |
149 | | #endif |
150 | | |
151 | | using vint = vint8; |
152 | | using vmask = vmask8; |
153 | | |
154 | | using vtable_16x8 = vtable8_16x8; |
155 | | using vtable_32x8 = vtable8_32x8; |
156 | | using vtable_64x8 = vtable8_64x8; |
157 | | |
158 | | constexpr auto loada = vfloat8::loada; |
159 | | constexpr auto load1 = vfloat8::load1; |
160 | | constexpr auto vint_from_size = vint8_from_size; |
161 | | |
162 | | #elif ASTCENC_NEON > 0 |
163 | | // If we have NEON expose 4-wide VLA. |
164 | | #include "astcenc_vecmathlib_neon_4.h" |
165 | | #include "astcenc_vecmathlib_common_4.h" |
166 | | |
167 | | #define ASTCENC_SIMD_WIDTH 4 |
168 | | |
169 | | using vfloat = vfloat4; |
170 | | using vfloatacc = vfloat4; |
171 | | using vint = vint4; |
172 | | using vmask = vmask4; |
173 | | |
174 | | using vtable_16x8 = vtable4_16x8; |
175 | | using vtable_32x8 = vtable4_32x8; |
176 | | using vtable_64x8 = vtable4_64x8; |
177 | | |
178 | | constexpr auto loada = vfloat4::loada; |
179 | | constexpr auto load1 = vfloat4::load1; |
180 | | constexpr auto vint_from_size = vint4_from_size; |
181 | | |
182 | | #elif defined(__riscv_v_fixed_vlen) && (__riscv_v_fixed_vlen == 256) |
183 | | #include <riscv_vector.h> |
184 | | #include "astcenc_vecmathlib_none_4.h" |
185 | | #include "astcenc_vecmathlib_common_4.h" |
186 | | #include "astcenc_vecmathlib_rvv_n.h" |
187 | | |
188 | | #if defined(ASTCENC_NO_INVARIANCE) |
189 | | using vfloatacc = vfloat; |
190 | | #else |
191 | | using vfloatacc = vfloat4; |
192 | | #endif |
193 | | |
194 | | constexpr auto loada = vfloat::loada; |
195 | | constexpr auto load1 = vfloat::load1; |
196 | | |
197 | | // For unit-tests |
198 | | using vfloat8 = vfloat; |
199 | | using vint8 = vint; |
200 | | using vmask8 = vmask; |
201 | | using vtable8_16x8 = vtable_16x8; |
202 | | using vtable8_32x8 = vtable_32x8; |
203 | | using vtable8_64x8 = vtable_64x8; |
204 | | |
205 | | #else |
206 | | // If we have nothing expose 4-wide VLA, and 4-wide fixed width. |
207 | | |
208 | | // Note: We no longer expose the 1-wide scalar fallback because it is not |
209 | | // invariant with the 4-wide path due to algorithms that use horizontal |
210 | | // operations that accumulate a local vector sum before accumulating into |
211 | | // a running sum. |
212 | | // |
213 | | // For 4 items adding into an accumulator using 1-wide vectors the sum is: |
214 | | // |
215 | | // result = ((((sum + l0) + l1) + l2) + l3) |
216 | | // |
217 | | // ... whereas the accumulator for a 4-wide vector sum is: |
218 | | // |
219 | | // result = sum + ((l0 + l2) + (l1 + l3)) |
220 | | // |
221 | | // In "normal maths" this is the same, but the floating point reassociation |
222 | | // differences mean that these will not produce the same result. |
223 | | |
224 | | #include "astcenc_vecmathlib_none_4.h" |
225 | | #include "astcenc_vecmathlib_common_4.h" |
226 | | |
227 | 144M | #define ASTCENC_SIMD_WIDTH 4 |
228 | | |
229 | | using vfloat = vfloat4; |
230 | | using vfloatacc = vfloat4; |
231 | | using vint = vint4; |
232 | | using vmask = vmask4; |
233 | | |
234 | | using vtable_16x8 = vtable4_16x8; |
235 | | using vtable_32x8 = vtable4_32x8; |
236 | | using vtable_64x8 = vtable4_64x8; |
237 | | |
238 | | constexpr auto loada = vfloat4::loada; |
239 | | constexpr auto load1 = vfloat4::load1; |
240 | | constexpr auto vint_from_size = vint4_from_size; |
241 | | #endif |
242 | | |
243 | | /** |
244 | | * @brief Round a count down to the largest multiple of the SIMD width. |
245 | | * |
246 | | * Assumption that the vector width is a power of two ... |
247 | | * |
248 | | * @param count The unrounded value. |
249 | | * |
250 | | * @return The rounded value. |
251 | | */ |
252 | | ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count) |
253 | 69.9k | { |
254 | 69.9k | return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1)); |
255 | 69.9k | } |
256 | | |
257 | | /** |
258 | | * @brief Round a count up to the largest multiple of the SIMD width. |
259 | | * |
260 | | * Assumption that the vector width is a power of two ... |
261 | | * |
262 | | * @param count The unrounded value. |
263 | | * |
264 | | * @return The rounded value. |
265 | | */ |
266 | | ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count) |
267 | 42.7M | { |
268 | 42.7M | size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH; |
269 | 42.7M | return multiples * ASTCENC_SIMD_WIDTH; |
270 | 42.7M | } |
271 | | |
272 | | /** |
273 | | * @brief Return @c a with lanes negated if the @c b lane is negative. |
274 | | */ |
275 | | ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b) |
276 | 617k | { |
277 | 617k | vint ia = float_as_int(a); |
278 | 617k | vint ib = float_as_int(b); |
279 | 617k | vint sign_mask(static_cast<int>(0x80000000)); |
280 | 617k | vint r = ia ^ (ib & sign_mask); |
281 | 617k | return int_as_float(r); |
282 | 617k | } |
283 | | |
284 | | /** |
285 | | * @brief Return fast, but approximate, vector atan(x). |
286 | | * |
287 | | * Max error of this implementation is 0.004883. |
288 | | */ |
289 | | ASTCENC_SIMD_INLINE vfloat atan(vfloat x) |
290 | 308k | { |
291 | 308k | vmask c = abs(x) > vfloat(1.0f); |
292 | 308k | vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x); |
293 | 308k | vfloat y = select(x, vfloat(1.0f) / x, c); |
294 | 308k | y = y / (y * y * vfloat(0.28f) + vfloat(1.0f)); |
295 | 308k | return select(y, z - y, c); |
296 | 308k | } |
297 | | |
298 | | /** |
299 | | * @brief Return fast, but approximate, vector atan2(x, y). |
300 | | */ |
301 | | ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) |
302 | 308k | { |
303 | 308k | vfloat z = atan(abs(y / x)); |
304 | 308k | vmask xmask = x < vfloat::zero(); |
305 | 308k | return change_sign(select(z, vfloat(astc::PI) - z, xmask), y); |
306 | 308k | } |
307 | | |
308 | | /* |
309 | | * @brief Factory that returns a unit length 4 component vfloat4. |
310 | | */ |
311 | | static ASTCENC_SIMD_INLINE vfloat4 unit4() |
312 | 525k | { |
313 | 525k | return vfloat4(0.5f); |
314 | 525k | } Unexecuted instantiation: astcenc_entry.cpp:unit4() Unexecuted instantiation: astcenc_image.cpp:unit4() Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit4() Unexecuted instantiation: astcenc_percentile_tables.cpp:unit4() Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit4() Unexecuted instantiation: astcenc_weight_align.cpp:unit4() Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit4() Unexecuted instantiation: astcenc_block_sizes.cpp:unit4() Unexecuted instantiation: astcenc_color_unquantize.cpp:unit4() Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit4() Unexecuted instantiation: astcenc_compute_variance.cpp:unit4() Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit4() astcenc_find_best_partitioning.cpp:unit4() Line | Count | Source | 312 | 504k | { | 313 | 504k | return vfloat4(0.5f); | 314 | 504k | } |
astcenc_ideal_endpoints_and_weights.cpp:unit4() Line | Count | Source | 312 | 21.0k | { | 313 | 21.0k | return vfloat4(0.5f); | 314 | 21.0k | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit4() Unexecuted instantiation: astcenc_mathlib.cpp:unit4() Unexecuted instantiation: astcenc_partition_tables.cpp:unit4() Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unit4() Unexecuted instantiation: astcenc_quantization.cpp:unit4() Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit4() Unexecuted instantiation: astcenc_color_quantize.cpp:unit4() Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit4() |
315 | | |
316 | | /** |
317 | | * @brief Factory that returns a unit length 3 component vfloat4. |
318 | | */ |
319 | | static ASTCENC_SIMD_INLINE vfloat4 unit3() |
320 | 232k | { |
321 | 232k | float val = 0.577350258827209473f; |
322 | 232k | return vfloat4(val, val, val, 0.0f); |
323 | 232k | } Unexecuted instantiation: astcenc_entry.cpp:unit3() Unexecuted instantiation: astcenc_image.cpp:unit3() Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit3() Unexecuted instantiation: astcenc_percentile_tables.cpp:unit3() Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit3() Unexecuted instantiation: astcenc_weight_align.cpp:unit3() Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit3() Unexecuted instantiation: astcenc_block_sizes.cpp:unit3() Unexecuted instantiation: astcenc_color_unquantize.cpp:unit3() Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit3() Unexecuted instantiation: astcenc_compute_variance.cpp:unit3() Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit3() astcenc_find_best_partitioning.cpp:unit3() Line | Count | Source | 320 | 94.6k | { | 321 | 94.6k | float val = 0.577350258827209473f; | 322 | 94.6k | return vfloat4(val, val, val, 0.0f); | 323 | 94.6k | } |
astcenc_ideal_endpoints_and_weights.cpp:unit3() Line | Count | Source | 320 | 10.1k | { | 321 | 10.1k | float val = 0.577350258827209473f; | 322 | 10.1k | return vfloat4(val, val, val, 0.0f); | 323 | 10.1k | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit3() Unexecuted instantiation: astcenc_mathlib.cpp:unit3() Unexecuted instantiation: astcenc_partition_tables.cpp:unit3() astcenc_pick_best_endpoint_format.cpp:unit3() Line | Count | Source | 320 | 128k | { | 321 | 128k | float val = 0.577350258827209473f; | 322 | 128k | return vfloat4(val, val, val, 0.0f); | 323 | 128k | } |
Unexecuted instantiation: astcenc_quantization.cpp:unit3() Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit3() Unexecuted instantiation: astcenc_color_quantize.cpp:unit3() Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit3() |
324 | | |
325 | | /** |
326 | | * @brief Factory that returns a unit length 2 component vfloat4. |
327 | | */ |
328 | | static ASTCENC_SIMD_INLINE vfloat4 unit2() |
329 | 813 | { |
330 | 813 | float val = 0.707106769084930420f; |
331 | 813 | return vfloat4(val, val, 0.0f, 0.0f); |
332 | 813 | } Unexecuted instantiation: astcenc_entry.cpp:unit2() Unexecuted instantiation: astcenc_image.cpp:unit2() Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit2() Unexecuted instantiation: astcenc_percentile_tables.cpp:unit2() Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit2() Unexecuted instantiation: astcenc_weight_align.cpp:unit2() Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit2() Unexecuted instantiation: astcenc_block_sizes.cpp:unit2() Unexecuted instantiation: astcenc_color_unquantize.cpp:unit2() Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit2() Unexecuted instantiation: astcenc_compute_variance.cpp:unit2() Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit2() Unexecuted instantiation: astcenc_find_best_partitioning.cpp:unit2() astcenc_ideal_endpoints_and_weights.cpp:unit2() Line | Count | Source | 329 | 813 | { | 330 | 813 | float val = 0.707106769084930420f; | 331 | 813 | return vfloat4(val, val, 0.0f, 0.0f); | 332 | 813 | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit2() Unexecuted instantiation: astcenc_mathlib.cpp:unit2() Unexecuted instantiation: astcenc_partition_tables.cpp:unit2() Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unit2() Unexecuted instantiation: astcenc_quantization.cpp:unit2() Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit2() Unexecuted instantiation: astcenc_color_quantize.cpp:unit2() Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit2() |
333 | | |
334 | | /** |
335 | | * @brief Factory that returns a 3 component vfloat4. |
336 | | */ |
337 | | static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c) |
338 | 1.42M | { |
339 | 1.42M | return vfloat4(a, b, c, 0.0f); |
340 | 1.42M | } Unexecuted instantiation: astcenc_entry.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_image.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_weight_align.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_color_unquantize.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_compress_symbolic.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_compute_variance.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:vfloat3(float, float, float) astcenc_ideal_endpoints_and_weights.cpp:vfloat3(float, float, float) Line | Count | Source | 338 | 317k | { | 339 | 317k | return vfloat4(a, b, c, 0.0f); | 340 | 317k | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_mathlib.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:vfloat3(float, float, float) Unexecuted instantiation: astcenc_quantization.cpp:vfloat3(float, float, float) astcenc_averages_and_directions.cpp:vfloat3(float, float, float) Line | Count | Source | 338 | 1.10M | { | 339 | 1.10M | return vfloat4(a, b, c, 0.0f); | 340 | 1.10M | } |
Unexecuted instantiation: astcenc_color_quantize.cpp:vfloat3(float, float, float) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat3(float, float, float) |
341 | | |
342 | | /** |
343 | | * @brief Factory that returns a 2 component vfloat4. |
344 | | */ |
345 | | static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b) |
346 | 1.83M | { |
347 | 1.83M | return vfloat4(a, b, 0.0f, 0.0f); |
348 | 1.83M | } Unexecuted instantiation: astcenc_entry.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_image.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_weight_align.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_color_unquantize.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_compress_symbolic.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_compute_variance.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:vfloat2(float, float) astcenc_ideal_endpoints_and_weights.cpp:vfloat2(float, float) Line | Count | Source | 346 | 1.81M | { | 347 | 1.81M | return vfloat4(a, b, 0.0f, 0.0f); | 348 | 1.81M | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_mathlib.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:vfloat2(float, float) Unexecuted instantiation: astcenc_quantization.cpp:vfloat2(float, float) astcenc_averages_and_directions.cpp:vfloat2(float, float) Line | Count | Source | 346 | 16.5k | { | 347 | 16.5k | return vfloat4(a, b, 0.0f, 0.0f); | 348 | 16.5k | } |
Unexecuted instantiation: astcenc_color_quantize.cpp:vfloat2(float, float) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat2(float, float) |
349 | | |
350 | | /** |
351 | | * @brief Normalize a non-zero length vector to unit length. |
352 | | */ |
353 | | static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a) |
354 | 121k | { |
355 | 121k | vfloat4 length = dot(a, a); |
356 | 121k | return a / sqrt(length); |
357 | 121k | } Unexecuted instantiation: astcenc_entry.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_image.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:normalize(vfloat4) astcenc_ideal_endpoints_and_weights.cpp:normalize(vfloat4) Line | Count | Source | 354 | 121k | { | 355 | 121k | vfloat4 length = dot(a, a); | 356 | 121k | return a / sqrt(length); | 357 | 121k | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_quantization.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:normalize(vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:normalize(vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize(vfloat4) |
358 | | |
359 | | /** |
360 | | * @brief Normalize a vector, returning @c safe if len is zero. |
361 | | */ |
362 | | static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe) |
363 | 695k | { |
364 | 695k | vfloat4 length = dot(a, a); |
365 | 695k | if (length.lane<0>() != 0.0f) |
366 | 636k | { |
367 | 636k | return a / sqrt(length); |
368 | 636k | } |
369 | | |
370 | 59.0k | return safe; |
371 | 695k | } Unexecuted instantiation: astcenc_entry.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_image.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:normalize_safe(vfloat4, vfloat4) astcenc_find_best_partitioning.cpp:normalize_safe(vfloat4, vfloat4) Line | Count | Source | 363 | 599k | { | 364 | 599k | vfloat4 length = dot(a, a); | 365 | 599k | if (length.lane<0>() != 0.0f) | 366 | 546k | { | 367 | 546k | return a / sqrt(length); | 368 | 546k | } | 369 | | | 370 | 52.8k | return safe; | 371 | 599k | } |
astcenc_ideal_endpoints_and_weights.cpp:normalize_safe(vfloat4, vfloat4) Line | Count | Source | 363 | 32.0k | { | 364 | 32.0k | vfloat4 length = dot(a, a); | 365 | 32.0k | if (length.lane<0>() != 0.0f) | 366 | 29.5k | { | 367 | 29.5k | return a / sqrt(length); | 368 | 29.5k | } | 369 | | | 370 | 2.49k | return safe; | 371 | 32.0k | } |
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:normalize_safe(vfloat4, vfloat4) astcenc_pick_best_endpoint_format.cpp:normalize_safe(vfloat4, vfloat4) Line | Count | Source | 363 | 64.0k | { | 364 | 64.0k | vfloat4 length = dot(a, a); | 365 | 64.0k | if (length.lane<0>() != 0.0f) | 366 | 60.3k | { | 367 | 60.3k | return a / sqrt(length); | 368 | 60.3k | } | 369 | | | 370 | 3.67k | return safe; | 371 | 64.0k | } |
Unexecuted instantiation: astcenc_quantization.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:normalize_safe(vfloat4, vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize_safe(vfloat4, vfloat4) |
372 | | |
373 | | |
374 | | |
/*
 * Polynomial evaluation helpers using Horner's scheme.
 *
 * POLYn(x, c0, ..., cn) evaluates the degree n polynomial with coefficients
 * c0 (constant term) through cn (highest power) as:
 *
 *     (((cn * x + c(n-1)) * x + ...) * x + c0)
 *
 * Every parameter is parenthesized in the expansion so that compound
 * expressions such as "a + b" can be passed safely. Note that x is expanded
 * once per degree, so it must not have side effects. POLY0 ignores x.
 */
#define POLY0(x, c0)                     (c0)
#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * (x)) + (c0))
#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * (x)) + (c0))
#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * (x)) + (c0))
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * (x)) + (c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * (x)) + (c0))
381 | | |
382 | | /** |
383 | | * @brief Compute an approximate exp2(x) for each lane in the vector. |
384 | | * |
385 | | * Based on 5th degree minimax polynomials, ported from this blog |
386 | | * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html |
387 | | */ |
388 | | static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x) |
389 | 1.79k | { |
390 | 1.79k | x = clamp(-126.99999f, 129.0f, x); |
391 | | |
392 | 1.79k | vint4 ipart = float_to_int(x - 0.5f); |
393 | 1.79k | vfloat4 fpart = x - int_to_float(ipart); |
394 | | |
395 | | // Integer contrib, using 1 << ipart |
396 | 1.79k | vfloat4 iexp = int_as_float(lsl<23>(ipart + 127)); |
397 | | |
398 | | // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5) |
399 | 1.79k | vfloat4 fexp = POLY5(fpart, |
400 | 1.79k | 9.9999994e-1f, |
401 | 1.79k | 6.9315308e-1f, |
402 | 1.79k | 2.4015361e-1f, |
403 | 1.79k | 5.5826318e-2f, |
404 | 1.79k | 8.9893397e-3f, |
405 | 1.79k | 1.8775767e-3f); |
406 | | |
407 | 1.79k | return iexp * fexp; |
408 | 1.79k | } astcenc_entry.cpp:exp2(vfloat4) Line | Count | Source | 389 | 1.79k | { | 390 | 1.79k | x = clamp(-126.99999f, 129.0f, x); | 391 | | | 392 | 1.79k | vint4 ipart = float_to_int(x - 0.5f); | 393 | 1.79k | vfloat4 fpart = x - int_to_float(ipart); | 394 | | | 395 | | // Integer contrib, using 1 << ipart | 396 | 1.79k | vfloat4 iexp = int_as_float(lsl<23>(ipart + 127)); | 397 | | | 398 | | // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5) | 399 | 1.79k | vfloat4 fexp = POLY5(fpart, | 400 | 1.79k | 9.9999994e-1f, | 401 | 1.79k | 6.9315308e-1f, | 402 | 1.79k | 2.4015361e-1f, | 403 | 1.79k | 5.5826318e-2f, | 404 | 1.79k | 8.9893397e-3f, | 405 | 1.79k | 1.8775767e-3f); | 406 | | | 407 | 1.79k | return iexp * fexp; | 408 | 1.79k | } |
Unexecuted instantiation: astcenc_image.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_integer_sequence.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_quantization.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:exp2(vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:exp2(vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:exp2(vfloat4) |
409 | | |
410 | | /** |
411 | | * @brief Compute an approximate log2(x) for each lane in the vector. |
412 | | * |
413 | | * Based on 5th degree minimax polynomials, ported from this blog |
414 | | * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html |
415 | | */ |
416 | | static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x) |
417 | 1.79k | { |
418 | 1.79k | vint4 exp(0x7F800000); |
419 | 1.79k | vint4 mant(0x007FFFFF); |
420 | 1.79k | vint4 one(0x3F800000); |
421 | | |
422 | 1.79k | vint4 i = float_as_int(x); |
423 | | |
424 | 1.79k | vfloat4 e = int_to_float(lsr<23>(i & exp) - 127); |
425 | | |
426 | 1.79k | vfloat4 m = int_as_float((i & mant) | one); |
427 | | |
428 | | // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2) |
429 | 1.79k | vfloat4 p = POLY4(m, |
430 | 1.79k | 2.8882704548164776201f, |
431 | 1.79k | -2.52074962577807006663f, |
432 | 1.79k | 1.48116647521213171641f, |
433 | 1.79k | -0.465725644288844778798f, |
434 | 1.79k | 0.0596515482674574969533f); |
435 | | |
436 | | // Increases the polynomial degree, but ensures that log2(1) == 0 |
437 | 1.79k | p = p * (m - 1.0f); |
438 | | |
439 | 1.79k | return p + e; |
440 | 1.79k | } astcenc_entry.cpp:log2(vfloat4) Line | Count | Source | 417 | 1.79k | { | 418 | 1.79k | vint4 exp(0x7F800000); | 419 | 1.79k | vint4 mant(0x007FFFFF); | 420 | 1.79k | vint4 one(0x3F800000); | 421 | | | 422 | 1.79k | vint4 i = float_as_int(x); | 423 | | | 424 | 1.79k | vfloat4 e = int_to_float(lsr<23>(i & exp) - 127); | 425 | | | 426 | 1.79k | vfloat4 m = int_as_float((i & mant) | one); | 427 | | | 428 | | // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2) | 429 | 1.79k | vfloat4 p = POLY4(m, | 430 | 1.79k | 2.8882704548164776201f, | 431 | 1.79k | -2.52074962577807006663f, | 432 | 1.79k | 1.48116647521213171641f, | 433 | 1.79k | -0.465725644288844778798f, | 434 | 1.79k | 0.0596515482674574969533f); | 435 | | | 436 | | // Increases the polynomial degree, but ensures that log2(1) == 0 | 437 | 1.79k | p = p * (m - 1.0f); | 438 | | | 439 | 1.79k | return p + e; | 440 | 1.79k | } |
Unexecuted instantiation: astcenc_image.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_integer_sequence.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_quantization.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:log2(vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:log2(vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:log2(vfloat4) |
441 | | |
442 | | /** |
443 | | * @brief Compute an approximate pow(x, y) for each lane in the vector. |
444 | | * |
445 | | * Power function based on the exp2(log2(x) * y) transform. |
446 | | */ |
447 | | static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y) |
448 | 1.79k | { |
449 | 1.79k | vmask4 zero_mask = y == vfloat4(0.0f); |
450 | 1.79k | vfloat4 estimate = exp2(log2(x) * y); |
451 | | |
452 | | // Guarantee that y == 0 returns exactly 1.0f |
453 | 1.79k | return select(estimate, vfloat4(1.0f), zero_mask); |
454 | 1.79k | } astcenc_entry.cpp:pow(vfloat4, vfloat4) Line | Count | Source | 448 | 1.79k | { | 449 | 1.79k | vmask4 zero_mask = y == vfloat4(0.0f); | 450 | 1.79k | vfloat4 estimate = exp2(log2(x) * y); | 451 | | | 452 | | // Guarantee that y == 0 returns exactly 1.0f | 453 | 1.79k | return select(estimate, vfloat4(1.0f), zero_mask); | 454 | 1.79k | } |
Unexecuted instantiation: astcenc_image.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_integer_sequence.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_quantization.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:pow(vfloat4, vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:pow(vfloat4, vfloat4) |
455 | | |
456 | | /** |
457 | | * @brief Count the leading zeros for each lane in @c a. |
458 | | * |
459 | | * Valid for all data values of @c a; will return a per-lane value [0, 32]. |
460 | | */ |
461 | | static ASTCENC_SIMD_INLINE vint4 clz(vint4 a) |
462 | 190k | { |
463 | | // This function is a horrible abuse of floating point exponents to convert |
464 | | // the original integer value into a 2^N encoding we can recover easily. |
465 | | |
466 | | // Convert to float without risk of rounding up by keeping only top 8 bits. |
467 | | // This trick is is guaranteed to keep top 8 bits and clear the 9th. |
468 | 190k | a = (~lsr<8>(a)) & a; |
469 | 190k | a = float_as_int(int_to_float(a)); |
470 | | |
471 | | // Extract and unbias exponent |
472 | 190k | a = vint4(127 + 31) - lsr<23>(a); |
473 | | |
474 | | // Clamp result to a valid 32-bit range |
475 | 190k | return clamp(0, 32, a); |
476 | 190k | } Unexecuted instantiation: astcenc_entry.cpp:clz(vint4) Unexecuted instantiation: astcenc_image.cpp:clz(vint4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:clz(vint4) Unexecuted instantiation: astcenc_percentile_tables.cpp:clz(vint4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:clz(vint4) Unexecuted instantiation: astcenc_weight_align.cpp:clz(vint4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:clz(vint4) Unexecuted instantiation: astcenc_block_sizes.cpp:clz(vint4) Unexecuted instantiation: astcenc_color_unquantize.cpp:clz(vint4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:clz(vint4) Unexecuted instantiation: astcenc_compute_variance.cpp:clz(vint4) astcenc_decompress_symbolic.cpp:clz(vint4) Line | Count | Source | 462 | 190k | { | 463 | | // This function is a horrible abuse of floating point exponents to convert | 464 | | // the original integer value into a 2^N encoding we can recover easily. | 465 | | | 466 | | // Convert to float without risk of rounding up by keeping only top 8 bits. | 467 | | // This trick is is guaranteed to keep top 8 bits and clear the 9th. | 468 | 190k | a = (~lsr<8>(a)) & a; | 469 | 190k | a = float_as_int(int_to_float(a)); | 470 | | | 471 | | // Extract and unbias exponent | 472 | 190k | a = vint4(127 + 31) - lsr<23>(a); | 473 | | | 474 | | // Clamp result to a valid 32-bit range | 475 | 190k | return clamp(0, 32, a); | 476 | 190k | } |
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:clz(vint4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:clz(vint4) Unexecuted instantiation: astcenc_integer_sequence.cpp:clz(vint4) Unexecuted instantiation: astcenc_mathlib.cpp:clz(vint4) Unexecuted instantiation: astcenc_partition_tables.cpp:clz(vint4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:clz(vint4) Unexecuted instantiation: astcenc_quantization.cpp:clz(vint4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:clz(vint4) Unexecuted instantiation: astcenc_color_quantize.cpp:clz(vint4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:clz(vint4) |
477 | | |
478 | | /** |
479 | | * @brief Return lanewise 2^a for each lane in @c a. |
480 | | * |
481 | | * Use of signed int means that this is only valid for values in range [0, 31]. |
482 | | */ |
483 | | static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a) |
484 | 190k | { |
485 | | // 2^30 is the largest signed number than can be represented |
486 | 190k | assert(all(a < vint4(31))); |
487 | | |
488 | | // This function is a horrible abuse of floating point to use the exponent |
489 | | // and float conversion to generate a 2^N multiple. |
490 | | |
491 | | // Bias the exponent |
492 | 190k | vint4 exp = a + 127; |
493 | 190k | exp = lsl<23>(exp); |
494 | | |
495 | | // Reinterpret the bits as a float, and then convert to an int |
496 | 190k | vfloat4 f = int_as_float(exp); |
497 | 190k | return float_to_int(f); |
498 | 190k | } Unexecuted instantiation: astcenc_entry.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_image.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_percentile_tables.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_weight_align.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_block_sizes.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_color_unquantize.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_compute_variance.cpp:two_to_the_n(vint4) astcenc_decompress_symbolic.cpp:two_to_the_n(vint4) Line | Count | Source | 484 | 190k | { | 485 | | // 2^30 is the largest signed number than can be represented | 486 | 190k | assert(all(a < vint4(31))); | 487 | | | 488 | | // This function is a horrible abuse of floating point to use the exponent | 489 | | // and float conversion to generate a 2^N multiple. | 490 | | | 491 | | // Bias the exponent | 492 | 190k | vint4 exp = a + 127; | 493 | 190k | exp = lsl<23>(exp); | 494 | | | 495 | | // Reinterpret the bits as a float, and then convert to an int | 496 | 190k | vfloat4 f = int_as_float(exp); | 497 | 190k | return float_to_int(f); | 498 | 190k | } |
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_integer_sequence.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_mathlib.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_partition_tables.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_quantization.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:two_to_the_n(vint4) Unexecuted instantiation: astcenc_color_quantize.cpp:two_to_the_n(vint4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:two_to_the_n(vint4) |
499 | | |
500 | | /** |
501 | | * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1]. |
502 | | */ |
503 | | static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p) |
504 | 190k | { |
505 | 190k | vint4 fp16_one = vint4(0x3C00); |
506 | 190k | vint4 fp16_small = lsl<8>(p); |
507 | | |
508 | 190k | vmask4 is_one = p == vint4(0xFFFF); |
509 | 190k | vmask4 is_small = p < vint4(4); |
510 | | |
511 | | // Manually inline clz() on Visual Studio to avoid release build codegen bug |
512 | | // see https://github.com/ARM-software/astc-encoder/issues/259 |
513 | | #if !defined(__clang__) && defined(_MSC_VER) |
514 | | vint4 a = (~lsr<8>(p)) & p; |
515 | | a = float_as_int(int_to_float(a)); |
516 | | a = vint4(127 + 31) - lsr<23>(a); |
517 | | vint4 lz = clamp(0, 32, a) - 16; |
518 | | #else |
519 | 190k | vint4 lz = clz(p) - 16; |
520 | 190k | #endif |
521 | | |
522 | 190k | p = p * two_to_the_n(lz + 1); |
523 | 190k | p = p & vint4(0xFFFF); |
524 | | |
525 | 190k | p = lsr<6>(p); |
526 | | |
527 | 190k | p = p | lsl<10>(vint4(14) - lz); |
528 | | |
529 | 190k | vint4 r = select(p, fp16_one, is_one); |
530 | 190k | r = select(r, fp16_small, is_small); |
531 | 190k | return r; |
532 | 190k | } Unexecuted instantiation: astcenc_entry.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_image.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_percentile_tables.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_weight_align.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_block_sizes.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_color_unquantize.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_compute_variance.cpp:unorm16_to_sf16(vint4) astcenc_decompress_symbolic.cpp:unorm16_to_sf16(vint4) Line | Count | Source | 504 | 190k | { | 505 | 190k | vint4 fp16_one = vint4(0x3C00); | 506 | 190k | vint4 fp16_small = lsl<8>(p); | 507 | | | 508 | 190k | vmask4 is_one = p == vint4(0xFFFF); | 509 | 190k | vmask4 is_small = p < vint4(4); | 510 | | | 511 | | // Manually inline clz() on Visual Studio to avoid release build codegen bug | 512 | | // see https://github.com/ARM-software/astc-encoder/issues/259 | 513 | | #if !defined(__clang__) && defined(_MSC_VER) | 514 | | vint4 a = (~lsr<8>(p)) & p; | 515 | | a = float_as_int(int_to_float(a)); | 516 | | a = vint4(127 + 31) - lsr<23>(a); | 517 | | vint4 lz = clamp(0, 32, a) - 16; | 518 | | #else | 519 | 190k | vint4 lz = clz(p) - 16; | 520 | 190k | #endif | 521 | | | 522 | 190k | p = p * two_to_the_n(lz + 1); | 523 | 190k | p = p & vint4(0xFFFF); | 524 | | | 525 | 190k | p = lsr<6>(p); | 526 | | | 527 | 190k | p = p | lsl<10>(vint4(14) - lz); | 528 | | | 529 | 190k | vint4 r = select(p, fp16_one, is_one); | 530 | 190k | r = select(r, fp16_small, is_small); | 531 | 190k | return r; | 532 | 190k | } |
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_integer_sequence.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_mathlib.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_partition_tables.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_quantization.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: astcenc_color_quantize.cpp:unorm16_to_sf16(vint4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unorm16_to_sf16(vint4) |
533 | | |
534 | | /** |
535 | | * @brief Convert 16-bit LNS to float16. |
536 | | */ |
537 | | static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p) |
538 | 36.6k | { |
539 | 36.6k | vint4 mc = p & 0x7FF; |
540 | 36.6k | vint4 ec = lsr<11>(p); |
541 | | |
542 | 36.6k | vint4 mc_512 = mc * 3; |
543 | 36.6k | vmask4 mask_512 = mc < vint4(512); |
544 | | |
545 | 36.6k | vint4 mc_1536 = mc * 4 - 512; |
546 | 36.6k | vmask4 mask_1536 = mc < vint4(1536); |
547 | | |
548 | 36.6k | vint4 mc_else = mc * 5 - 2048; |
549 | | |
550 | 36.6k | vint4 mt = mc_else; |
551 | 36.6k | mt = select(mt, mc_1536, mask_1536); |
552 | 36.6k | mt = select(mt, mc_512, mask_512); |
553 | | |
554 | 36.6k | vint4 res = lsl<10>(ec) | lsr<3>(mt); |
555 | 36.6k | return min(res, vint4(0x7BFF)); |
556 | 36.6k | } Unexecuted instantiation: astcenc_entry.cpp:lns_to_sf16(vint4) astcenc_image.cpp:lns_to_sf16(vint4) Line | Count | Source | 538 | 1.07k | { | 539 | 1.07k | vint4 mc = p & 0x7FF; | 540 | 1.07k | vint4 ec = lsr<11>(p); | 541 | | | 542 | 1.07k | vint4 mc_512 = mc * 3; | 543 | 1.07k | vmask4 mask_512 = mc < vint4(512); | 544 | | | 545 | 1.07k | vint4 mc_1536 = mc * 4 - 512; | 546 | 1.07k | vmask4 mask_1536 = mc < vint4(1536); | 547 | | | 548 | 1.07k | vint4 mc_else = mc * 5 - 2048; | 549 | | | 550 | 1.07k | vint4 mt = mc_else; | 551 | 1.07k | mt = select(mt, mc_1536, mask_1536); | 552 | 1.07k | mt = select(mt, mc_512, mask_512); | 553 | | | 554 | 1.07k | vint4 res = lsl<10>(ec) | lsr<3>(mt); | 555 | 1.07k | return min(res, vint4(0x7BFF)); | 556 | 1.07k | } |
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_percentile_tables.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_weight_align.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_block_sizes.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_color_unquantize.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_compute_variance.cpp:lns_to_sf16(vint4) astcenc_decompress_symbolic.cpp:lns_to_sf16(vint4) Line | Count | Source | 538 | 35.5k | { | 539 | 35.5k | vint4 mc = p & 0x7FF; | 540 | 35.5k | vint4 ec = lsr<11>(p); | 541 | | | 542 | 35.5k | vint4 mc_512 = mc * 3; | 543 | 35.5k | vmask4 mask_512 = mc < vint4(512); | 544 | | | 545 | 35.5k | vint4 mc_1536 = mc * 4 - 512; | 546 | 35.5k | vmask4 mask_1536 = mc < vint4(1536); | 547 | | | 548 | 35.5k | vint4 mc_else = mc * 5 - 2048; | 549 | | | 550 | 35.5k | vint4 mt = mc_else; | 551 | 35.5k | mt = select(mt, mc_1536, mask_1536); | 552 | 35.5k | mt = select(mt, mc_512, mask_512); | 553 | | | 554 | 35.5k | vint4 res = lsl<10>(ec) | lsr<3>(mt); | 555 | 35.5k | return min(res, vint4(0x7BFF)); | 556 | 35.5k | } |
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_integer_sequence.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_mathlib.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_partition_tables.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_quantization.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:lns_to_sf16(vint4) Unexecuted instantiation: astcenc_color_quantize.cpp:lns_to_sf16(vint4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:lns_to_sf16(vint4) |
557 | | |
558 | | /** |
559 | | * @brief Extract mantissa and exponent of a float value. |
560 | | * |
561 | | * @param a The input value. |
562 | | * @param[out] exp The output exponent. |
563 | | * |
564 | | * @return The mantissa. |
565 | | */ |
566 | | static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp) |
567 | 27.0k | { |
568 | | // Interpret the bits as an integer |
569 | 27.0k | vint4 ai = float_as_int(a); |
570 | | |
571 | | // Extract and unbias the exponent |
572 | 27.0k | exp = (lsr<23>(ai) & 0xFF) - 126; |
573 | | |
574 | | // Extract and unbias the mantissa |
575 | 27.0k | vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000; |
576 | 27.0k | return int_as_float(manti); |
577 | 27.0k | } Unexecuted instantiation: astcenc_entry.cpp:frexp(vfloat4, vint4&) astcenc_image.cpp:frexp(vfloat4, vint4&) Line | Count | Source | 567 | 27.0k | { | 568 | | // Interpret the bits as an integer | 569 | 27.0k | vint4 ai = float_as_int(a); | 570 | | | 571 | | // Extract and unbias the exponent | 572 | 27.0k | exp = (lsr<23>(ai) & 0xFF) - 126; | 573 | | | 574 | | // Extract and unbias the mantissa | 575 | 27.0k | vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000; | 576 | 27.0k | return int_as_float(manti); | 577 | 27.0k | } |
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_percentile_tables.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_symbolic_physical.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_weight_align.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_block_sizes.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_color_unquantize.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_compress_symbolic.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_compute_variance.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_integer_sequence.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_mathlib.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_partition_tables.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_quantization.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_averages_and_directions.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: astcenc_color_quantize.cpp:frexp(vfloat4, vint4&) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:frexp(vfloat4, vint4&) |
578 | | |
579 | | /** |
580 | | * @brief Convert float to 16-bit LNS. |
581 | | */ |
582 | | static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a) |
583 | 27.0k | { |
584 | 27.0k | vint4 exp; |
585 | 27.0k | vfloat4 mant = frexp(a, exp); |
586 | | |
587 | | // Do these early before we start messing about ... |
588 | 27.0k | vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f)); |
589 | 27.0k | vmask4 mask_infinity = a >= vfloat4(65536.0f); |
590 | | |
591 | | // If input is smaller than 2^-14, multiply by 2^25 and don't bias. |
592 | 27.0k | vmask4 exp_lt_m13 = exp < vint4(-13); |
593 | | |
594 | 27.0k | vfloat4 a1a = a * 33554432.0f; |
595 | 27.0k | vint4 expa = vint4::zero(); |
596 | | |
597 | 27.0k | vfloat4 a1b = (mant - 0.5f) * 4096; |
598 | 27.0k | vint4 expb = exp + 14; |
599 | | |
600 | 27.0k | a = select(a1b, a1a, exp_lt_m13); |
601 | 27.0k | exp = select(expb, expa, exp_lt_m13); |
602 | | |
603 | 27.0k | vmask4 a_lt_384 = a < vfloat4(384.0f); |
604 | 27.0k | vmask4 a_lt_1408 = a <= vfloat4(1408.0f); |
605 | | |
606 | 27.0k | vfloat4 a2a = a * (4.0f / 3.0f); |
607 | 27.0k | vfloat4 a2b = a + 128.0f; |
608 | 27.0k | vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f); |
609 | | |
610 | 27.0k | a = a2c; |
611 | 27.0k | a = select(a, a2b, a_lt_1408); |
612 | 27.0k | a = select(a, a2a, a_lt_384); |
613 | | |
614 | 27.0k | a = a + (int_to_float(exp) * 2048.0f) + 1.0f; |
615 | | |
616 | 27.0k | a = select(a, vfloat4(65535.0f), mask_infinity); |
617 | 27.0k | a = select(a, vfloat4::zero(), mask_underflow_nan); |
618 | | |
619 | 27.0k | return a; |
620 | 27.0k | } Unexecuted instantiation: astcenc_entry.cpp:float_to_lns(vfloat4) astcenc_image.cpp:float_to_lns(vfloat4) Line | Count | Source | 583 | 27.0k | { | 584 | 27.0k | vint4 exp; | 585 | 27.0k | vfloat4 mant = frexp(a, exp); | 586 | | | 587 | | // Do these early before we start messing about ... | 588 | 27.0k | vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f)); | 589 | 27.0k | vmask4 mask_infinity = a >= vfloat4(65536.0f); | 590 | | | 591 | | // If input is smaller than 2^-14, multiply by 2^25 and don't bias. | 592 | 27.0k | vmask4 exp_lt_m13 = exp < vint4(-13); | 593 | | | 594 | 27.0k | vfloat4 a1a = a * 33554432.0f; | 595 | 27.0k | vint4 expa = vint4::zero(); | 596 | | | 597 | 27.0k | vfloat4 a1b = (mant - 0.5f) * 4096; | 598 | 27.0k | vint4 expb = exp + 14; | 599 | | | 600 | 27.0k | a = select(a1b, a1a, exp_lt_m13); | 601 | 27.0k | exp = select(expb, expa, exp_lt_m13); | 602 | | | 603 | 27.0k | vmask4 a_lt_384 = a < vfloat4(384.0f); | 604 | 27.0k | vmask4 a_lt_1408 = a <= vfloat4(1408.0f); | 605 | | | 606 | 27.0k | vfloat4 a2a = a * (4.0f / 3.0f); | 607 | 27.0k | vfloat4 a2b = a + 128.0f; | 608 | 27.0k | vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f); | 609 | | | 610 | 27.0k | a = a2c; | 611 | 27.0k | a = select(a, a2b, a_lt_1408); | 612 | 27.0k | a = select(a, a2a, a_lt_384); | 613 | | | 614 | 27.0k | a = a + (int_to_float(exp) * 2048.0f) + 1.0f; | 615 | | | 616 | 27.0k | a = select(a, vfloat4(65535.0f), mask_infinity); | 617 | 27.0k | a = select(a, vfloat4::zero(), mask_underflow_nan); | 618 | | | 619 | 27.0k | return a; | 620 | 27.0k | } |
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_percentile_tables.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_symbolic_physical.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_weight_align.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_block_sizes.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_color_unquantize.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_compress_symbolic.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_compute_variance.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_integer_sequence.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_mathlib.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_partition_tables.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_quantization.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_averages_and_directions.cpp:float_to_lns(vfloat4) Unexecuted instantiation: astcenc_color_quantize.cpp:float_to_lns(vfloat4) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:float_to_lns(vfloat4) |
621 | | |
622 | | namespace astc |
623 | | { |
624 | | |
625 | | static ASTCENC_SIMD_INLINE float pow(float x, float y) |
626 | 1.79k | { |
627 | 1.79k | return pow(vfloat4(x), vfloat4(y)).lane<0>(); |
628 | 1.79k | } astcenc_entry.cpp:astc::pow(float, float) Line | Count | Source | 626 | 1.79k | { | 627 | 1.79k | return pow(vfloat4(x), vfloat4(y)).lane<0>(); | 628 | 1.79k | } |
Unexecuted instantiation: astcenc_image.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_percentile_tables.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_symbolic_physical.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_weight_align.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_block_sizes.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_color_unquantize.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_compress_symbolic.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_compute_variance.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_decompress_symbolic.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_find_best_partitioning.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_integer_sequence.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_mathlib.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_partition_tables.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_quantization.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_averages_and_directions.cpp:astc::pow(float, float) Unexecuted instantiation: astcenc_color_quantize.cpp:astc::pow(float, float) Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:astc::pow(float, float) |
629 | | |
630 | | } |
631 | | |
632 | | #endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED |