Coverage Report

Created: 2026-05-16 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/astc-encoder/Source/astcenc_vecmathlib.h
Line
Count
Source
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2019-2026 Arm Limited
4
// Copyright 2008 Jose Fonseca
5
// Copyright 2026 Olaf Bernstein
6
//
7
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
8
// use this file except in compliance with the License. You may obtain a copy
9
// of the License at:
10
//
11
//     http://www.apache.org/licenses/LICENSE-2.0
12
//
13
// Unless required by applicable law or agreed to in writing, software
14
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
15
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
16
// License for the specific language governing permissions and limitations
17
// under the License.
18
// ----------------------------------------------------------------------------
19
20
/*
21
 * This module implements vector support for floats, ints, and vector lane
22
 * control masks. It provides access to both explicit vector width types, and
23
 * flexible N-wide types where N can be determined at compile time.
24
 *
25
 * The design of this module encourages use of vector length agnostic code, via
26
 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
27
 * width that is available at compile time. The current vector width is
28
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
29
 *
30
 * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
31
 * These are provided primarily for prototyping and algorithm debug of VLA
32
 * implementations.
33
 *
34
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
35
 * types. These are provided for use by VLA code, but are also expected to be
36
 * used as a fixed-width type and will support a reference C++ fallback for
37
 * use on platforms without SIMD intrinsics.
38
 *
39
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
40
 * types. These are provided for use by VLA code, and are not expected to be
41
 * used as a fixed-width type in normal code. No reference C implementation is
42
 * provided on platforms without underlying SIMD intrinsics.
43
 *
44
 * With the current implementation ISA support is provided for:
45
 *
46
 *     * 1-wide for scalar reference
47
 *     * 4-wide for Armv8-A NEON
48
 *     * 4-wide for x86-64 SSE2
49
 *     * 4-wide for x86-64 SSE4.1
50
 *     * 8-wide for Armv8-A SVE
51
 *     * 8-wide for x86-64 AVX2
52
 */
53
54
#ifndef ASTC_VECMATHLIB_H_INCLUDED
55
#define ASTC_VECMATHLIB_H_INCLUDED
56
57
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
58
  #include <immintrin.h>
59
#endif
60
61
#if ASTCENC_SVE != 0
62
  #include <arm_sve.h>
63
  #include <arm_neon_sve_bridge.h>
64
#endif
65
66
#if ASTCENC_NEON != 0
67
  #include <arm_neon.h>
68
#endif
69
70
#if !defined(__clang__) && defined(_MSC_VER)
71
  #define ASTCENC_SIMD_INLINE __forceinline
72
  #define ASTCENC_NO_INLINE
73
#elif defined(__GNUC__) && !defined(__clang__)
74
  #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
75
  #define ASTCENC_NO_INLINE __attribute__ ((noinline))
76
#else
77
  #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
78
  #define ASTCENC_NO_INLINE __attribute__ ((noinline))
79
#endif
80
81
template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
82
83
#if ASTCENC_AVX >= 2
84
  // If we have AVX2 expose 8-wide VLA.
85
  #include "astcenc_vecmathlib_sse_4.h"
86
  #include "astcenc_vecmathlib_common_4.h"
87
  #include "astcenc_vecmathlib_avx2_8.h"
88
89
  #define ASTCENC_SIMD_WIDTH 8
90
91
  using vfloat = vfloat8;
92
93
  #if defined(ASTCENC_NO_INVARIANCE)
94
    using vfloatacc = vfloat8;
95
  #else
96
    using vfloatacc = vfloat4;
97
  #endif
98
99
  using vint = vint8;
100
  using vmask = vmask8;
101
102
  using vtable_16x8 = vtable8_16x8;
103
  using vtable_32x8 = vtable8_32x8;
104
  using vtable_64x8 = vtable8_64x8;
105
106
  constexpr auto loada = vfloat8::loada;
107
  constexpr auto load1 = vfloat8::load1;
108
  constexpr auto vint_from_size = vint8_from_size;
109
110
#elif ASTCENC_SSE >= 20
111
  // If we have SSE expose 4-wide VLA, and 4-wide fixed width.
112
  #include "astcenc_vecmathlib_sse_4.h"
113
  #include "astcenc_vecmathlib_common_4.h"
114
115
  #define ASTCENC_SIMD_WIDTH 4
116
117
  using vfloat = vfloat4;
118
  using vfloatacc = vfloat4;
119
  using vint = vint4;
120
  using vmask = vmask4;
121
122
  using vtable_16x8 = vtable4_16x8;
123
  using vtable_32x8 = vtable4_32x8;
124
  using vtable_64x8 = vtable4_64x8;
125
126
  constexpr auto loada = vfloat4::loada;
127
  constexpr auto load1 = vfloat4::load1;
128
  constexpr auto vint_from_size = vint4_from_size;
129
130
#elif ASTCENC_SVE == 8
131
  // Check the compiler is configured with fixed-length 256-bit SVE.
132
  #if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
133
    #error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
134
  #endif
135
136
  // If we have SVE configured as 8-wide, expose 8-wide VLA.
137
  #include "astcenc_vecmathlib_neon_4.h"
138
  #include "astcenc_vecmathlib_common_4.h"
139
  #include "astcenc_vecmathlib_sve_8.h"
140
141
  #define ASTCENC_SIMD_WIDTH 8
142
143
  using vfloat = vfloat8;
144
145
  #if defined(ASTCENC_NO_INVARIANCE)
146
    using vfloatacc = vfloat8;
147
  #else
148
    using vfloatacc = vfloat4;
149
  #endif
150
151
  using vint = vint8;
152
  using vmask = vmask8;
153
154
  using vtable_16x8 = vtable8_16x8;
155
  using vtable_32x8 = vtable8_32x8;
156
  using vtable_64x8 = vtable8_64x8;
157
158
  constexpr auto loada = vfloat8::loada;
159
  constexpr auto load1 = vfloat8::load1;
160
  constexpr auto vint_from_size = vint8_from_size;
161
162
#elif ASTCENC_NEON > 0
163
  // If we have NEON expose 4-wide VLA.
164
  #include "astcenc_vecmathlib_neon_4.h"
165
  #include "astcenc_vecmathlib_common_4.h"
166
167
  #define ASTCENC_SIMD_WIDTH 4
168
169
  using vfloat = vfloat4;
170
  using vfloatacc = vfloat4;
171
  using vint = vint4;
172
  using vmask = vmask4;
173
174
  using vtable_16x8 = vtable4_16x8;
175
  using vtable_32x8 = vtable4_32x8;
176
  using vtable_64x8 = vtable4_64x8;
177
178
  constexpr auto loada = vfloat4::loada;
179
  constexpr auto load1 = vfloat4::load1;
180
  constexpr auto vint_from_size = vint4_from_size;
181
182
#elif defined(__riscv_v_fixed_vlen) && (__riscv_v_fixed_vlen == 256)
183
  #include <riscv_vector.h>
184
  #include "astcenc_vecmathlib_none_4.h"
185
  #include "astcenc_vecmathlib_common_4.h"
186
  #include "astcenc_vecmathlib_rvv_n.h"
187
188
  #if defined(ASTCENC_NO_INVARIANCE)
189
    using vfloatacc = vfloat;
190
  #else
191
    using vfloatacc = vfloat4;
192
  #endif
193
194
  constexpr auto loada = vfloat::loada;
195
  constexpr auto load1 = vfloat::load1;
196
197
  // For unit-tests
198
  using vfloat8 = vfloat;
199
  using vint8 = vint;
200
  using vmask8 = vmask;
201
  using vtable8_16x8 = vtable_16x8;
202
  using vtable8_32x8 = vtable_32x8;
203
  using vtable8_64x8 = vtable_64x8;
204
205
#else
206
  // If we have nothing expose 4-wide VLA, and 4-wide fixed width.
207
208
  // Note: We no longer expose the 1-wide scalar fallback because it is not
209
  // invariant with the 4-wide path due to algorithms that use horizontal
210
  // operations that accumulate a local vector sum before accumulating into
211
  // a running sum.
212
  //
213
  // For 4 items adding into an accumulator using 1-wide vectors the sum is:
214
  //
215
  //     result = ((((sum + l0) + l1) + l2) + l3)
216
  //
217
  // ... whereas the accumulator for a 4-wide vector sum is:
218
  //
219
  //     result = sum + ((l0 + l2) + (l1 + l3))
220
  //
221
  // In "normal maths" this is the same, but the floating point reassociation
222
  // differences mean that these will not produce the same result.
223
224
  #include "astcenc_vecmathlib_none_4.h"
225
  #include "astcenc_vecmathlib_common_4.h"
226
227
144M
  #define ASTCENC_SIMD_WIDTH 4
228
229
  using vfloat = vfloat4;
230
  using vfloatacc = vfloat4;
231
  using vint = vint4;
232
  using vmask = vmask4;
233
234
  using vtable_16x8 = vtable4_16x8;
235
  using vtable_32x8 = vtable4_32x8;
236
  using vtable_64x8 = vtable4_64x8;
237
238
  constexpr auto loada = vfloat4::loada;
239
  constexpr auto load1 = vfloat4::load1;
240
  constexpr auto vint_from_size = vint4_from_size;
241
#endif
242
243
/**
244
 * @brief Round a count down to the largest multiple of the SIMD width.
245
 *
246
 * Assumption that the vector width is a power of two ...
247
 *
248
 * @param count   The unrounded value.
249
 *
250
 * @return The rounded value.
251
 */
252
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
253
69.9k
{
254
69.9k
  return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
255
69.9k
}
256
257
/**
258
 * @brief Round a count up to the smallest multiple of the SIMD width.
259
 *
260
 * Assumption that the vector width is a power of two ...
261
 *
262
 * @param count   The unrounded value.
263
 *
264
 * @return The rounded value.
265
 */
266
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
267
42.7M
{
268
42.7M
  size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
269
42.7M
  return multiples * ASTCENC_SIMD_WIDTH;
270
42.7M
}
271
272
/**
273
 * @brief Return @c a with lanes negated if the @c b lane is negative.
274
 */
275
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
276
617k
{
277
617k
  vint ia = float_as_int(a);
278
617k
  vint ib = float_as_int(b);
279
617k
  vint sign_mask(static_cast<int>(0x80000000));
280
617k
  vint r = ia ^ (ib & sign_mask);
281
617k
  return int_as_float(r);
282
617k
}
283
284
/**
285
 * @brief Return fast, but approximate, vector atan(x).
286
 *
287
 * Max error of this implementation is 0.004883.
288
 */
289
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
290
308k
{
291
308k
  vmask c = abs(x) > vfloat(1.0f);
292
308k
  vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
293
308k
  vfloat y = select(x, vfloat(1.0f) / x, c);
294
308k
  y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
295
308k
  return select(y, z - y, c);
296
308k
}
297
298
/**
299
 * @brief Return fast, but approximate, vector atan2(y, x).
300
 */
301
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
302
308k
{
303
308k
  vfloat z = atan(abs(y / x));
304
308k
  vmask xmask = x < vfloat::zero();
305
308k
  return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
306
308k
}
307
308
/**
309
 * @brief Factory that returns a unit length 4 component vfloat4.
310
 */
311
static ASTCENC_SIMD_INLINE vfloat4 unit4()
312
525k
{
313
525k
  return vfloat4(0.5f);
314
525k
}
Unexecuted instantiation: astcenc_entry.cpp:unit4()
Unexecuted instantiation: astcenc_image.cpp:unit4()
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit4()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit4()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit4()
Unexecuted instantiation: astcenc_weight_align.cpp:unit4()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit4()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit4()
Unexecuted instantiation: astcenc_color_unquantize.cpp:unit4()
Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit4()
Unexecuted instantiation: astcenc_compute_variance.cpp:unit4()
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit4()
astcenc_find_best_partitioning.cpp:unit4()
Line
Count
Source
312
504k
{
313
504k
  return vfloat4(0.5f);
314
504k
}
astcenc_ideal_endpoints_and_weights.cpp:unit4()
Line
Count
Source
312
21.0k
{
313
21.0k
  return vfloat4(0.5f);
314
21.0k
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit4()
Unexecuted instantiation: astcenc_mathlib.cpp:unit4()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit4()
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unit4()
Unexecuted instantiation: astcenc_quantization.cpp:unit4()
Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit4()
Unexecuted instantiation: astcenc_color_quantize.cpp:unit4()
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit4()
315
316
/**
317
 * @brief Factory that returns a unit length 3 component vfloat4.
318
 */
319
static ASTCENC_SIMD_INLINE vfloat4 unit3()
320
232k
{
321
232k
  float val = 0.577350258827209473f;
322
232k
  return vfloat4(val, val, val, 0.0f);
323
232k
}
Unexecuted instantiation: astcenc_entry.cpp:unit3()
Unexecuted instantiation: astcenc_image.cpp:unit3()
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit3()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit3()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit3()
Unexecuted instantiation: astcenc_weight_align.cpp:unit3()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit3()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit3()
Unexecuted instantiation: astcenc_color_unquantize.cpp:unit3()
Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit3()
Unexecuted instantiation: astcenc_compute_variance.cpp:unit3()
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit3()
astcenc_find_best_partitioning.cpp:unit3()
Line
Count
Source
320
94.6k
{
321
94.6k
  float val = 0.577350258827209473f;
322
94.6k
  return vfloat4(val, val, val, 0.0f);
323
94.6k
}
astcenc_ideal_endpoints_and_weights.cpp:unit3()
Line
Count
Source
320
10.1k
{
321
10.1k
  float val = 0.577350258827209473f;
322
10.1k
  return vfloat4(val, val, val, 0.0f);
323
10.1k
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit3()
Unexecuted instantiation: astcenc_mathlib.cpp:unit3()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit3()
astcenc_pick_best_endpoint_format.cpp:unit3()
Line
Count
Source
320
128k
{
321
128k
  float val = 0.577350258827209473f;
322
128k
  return vfloat4(val, val, val, 0.0f);
323
128k
}
Unexecuted instantiation: astcenc_quantization.cpp:unit3()
Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit3()
Unexecuted instantiation: astcenc_color_quantize.cpp:unit3()
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit3()
324
325
/**
326
 * @brief Factory that returns a unit length 2 component vfloat4.
327
 */
328
static ASTCENC_SIMD_INLINE vfloat4 unit2()
329
813
{
330
813
  float val = 0.707106769084930420f;
331
813
  return vfloat4(val, val, 0.0f, 0.0f);
332
813
}
Unexecuted instantiation: astcenc_entry.cpp:unit2()
Unexecuted instantiation: astcenc_image.cpp:unit2()
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unit2()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit2()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit2()
Unexecuted instantiation: astcenc_weight_align.cpp:unit2()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit2()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit2()
Unexecuted instantiation: astcenc_color_unquantize.cpp:unit2()
Unexecuted instantiation: astcenc_compress_symbolic.cpp:unit2()
Unexecuted instantiation: astcenc_compute_variance.cpp:unit2()
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:unit2()
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:unit2()
astcenc_ideal_endpoints_and_weights.cpp:unit2()
Line
Count
Source
329
813
{
330
813
  float val = 0.707106769084930420f;
331
813
  return vfloat4(val, val, 0.0f, 0.0f);
332
813
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit2()
Unexecuted instantiation: astcenc_mathlib.cpp:unit2()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit2()
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unit2()
Unexecuted instantiation: astcenc_quantization.cpp:unit2()
Unexecuted instantiation: astcenc_averages_and_directions.cpp:unit2()
Unexecuted instantiation: astcenc_color_quantize.cpp:unit2()
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit2()
333
334
/**
335
 * @brief Factory that returns a 3 component vfloat4.
336
 */
337
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
338
1.42M
{
339
1.42M
  return vfloat4(a, b, c, 0.0f);
340
1.42M
}
Unexecuted instantiation: astcenc_entry.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_image.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_weight_align.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_color_unquantize.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_compute_variance.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:vfloat3(float, float, float)
astcenc_ideal_endpoints_and_weights.cpp:vfloat3(float, float, float)
Line
Count
Source
338
317k
{
339
317k
  return vfloat4(a, b, c, 0.0f);
340
317k
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_quantization.cpp:vfloat3(float, float, float)
astcenc_averages_and_directions.cpp:vfloat3(float, float, float)
Line
Count
Source
338
1.10M
{
339
1.10M
  return vfloat4(a, b, c, 0.0f);
340
1.10M
}
Unexecuted instantiation: astcenc_color_quantize.cpp:vfloat3(float, float, float)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat3(float, float, float)
341
342
/**
343
 * @brief Factory that returns a 2 component vfloat4.
344
 */
345
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
346
1.83M
{
347
1.83M
  return vfloat4(a, b, 0.0f, 0.0f);
348
1.83M
}
Unexecuted instantiation: astcenc_entry.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_image.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_weight_align.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_color_unquantize.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_compute_variance.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:vfloat2(float, float)
astcenc_ideal_endpoints_and_weights.cpp:vfloat2(float, float)
Line
Count
Source
346
1.81M
{
347
1.81M
  return vfloat4(a, b, 0.0f, 0.0f);
348
1.81M
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_quantization.cpp:vfloat2(float, float)
astcenc_averages_and_directions.cpp:vfloat2(float, float)
Line
Count
Source
346
16.5k
{
347
16.5k
  return vfloat4(a, b, 0.0f, 0.0f);
348
16.5k
}
Unexecuted instantiation: astcenc_color_quantize.cpp:vfloat2(float, float)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat2(float, float)
349
350
/**
351
 * @brief Normalize a non-zero length vector to unit length.
352
 */
353
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
354
121k
{
355
121k
  vfloat4 length = dot(a, a);
356
121k
  return a / sqrt(length);
357
121k
}
Unexecuted instantiation: astcenc_entry.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_image.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:normalize(vfloat4)
astcenc_ideal_endpoints_and_weights.cpp:normalize(vfloat4)
Line
Count
Source
354
121k
{
355
121k
  vfloat4 length = dot(a, a);
356
121k
  return a / sqrt(length);
357
121k
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:normalize(vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize(vfloat4)
358
359
/**
360
 * @brief Normalize a vector, returning @c safe if len is zero.
361
 */
362
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
363
695k
{
364
695k
  vfloat4 length = dot(a, a);
365
695k
  if (length.lane<0>() != 0.0f)
366
636k
  {
367
636k
    return a / sqrt(length);
368
636k
  }
369
370
59.0k
  return safe;
371
695k
}
Unexecuted instantiation: astcenc_entry.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_image.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:normalize_safe(vfloat4, vfloat4)
astcenc_find_best_partitioning.cpp:normalize_safe(vfloat4, vfloat4)
Line
Count
Source
363
599k
{
364
599k
  vfloat4 length = dot(a, a);
365
599k
  if (length.lane<0>() != 0.0f)
366
546k
  {
367
546k
    return a / sqrt(length);
368
546k
  }
369
370
52.8k
  return safe;
371
599k
}
astcenc_ideal_endpoints_and_weights.cpp:normalize_safe(vfloat4, vfloat4)
Line
Count
Source
363
32.0k
{
364
32.0k
  vfloat4 length = dot(a, a);
365
32.0k
  if (length.lane<0>() != 0.0f)
366
29.5k
  {
367
29.5k
    return a / sqrt(length);
368
29.5k
  }
369
370
2.49k
  return safe;
371
32.0k
}
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:normalize_safe(vfloat4, vfloat4)
astcenc_pick_best_endpoint_format.cpp:normalize_safe(vfloat4, vfloat4)
Line
Count
Source
363
64.0k
{
364
64.0k
  vfloat4 length = dot(a, a);
365
64.0k
  if (length.lane<0>() != 0.0f)
366
60.3k
  {
367
60.3k
    return a / sqrt(length);
368
60.3k
  }
369
370
3.67k
  return safe;
371
64.0k
}
Unexecuted instantiation: astcenc_quantization.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize_safe(vfloat4, vfloat4)
372
373
374
375
7.19k
#define POLY0(x, c0)                     (                                     c0)
376
7.19k
#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * x)                 + c0)
377
7.19k
#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * x)             + c0)
378
7.19k
#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * x)         + c0)
379
5.39k
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * x)     + c0)
380
1.79k
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
381
382
/**
383
 * @brief Compute an approximate exp2(x) for each lane in the vector.
384
 *
385
 * Based on 5th degree minimax polynomials, ported from this blog
386
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
387
 */
388
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
389
1.79k
{
390
1.79k
  x = clamp(-126.99999f, 129.0f, x);
391
392
1.79k
  vint4 ipart = float_to_int(x - 0.5f);
393
1.79k
  vfloat4 fpart = x - int_to_float(ipart);
394
395
  // Integer contrib, using 1 << ipart
396
1.79k
  vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
397
398
  // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
399
1.79k
  vfloat4 fexp = POLY5(fpart,
400
1.79k
                       9.9999994e-1f,
401
1.79k
                       6.9315308e-1f,
402
1.79k
                       2.4015361e-1f,
403
1.79k
                       5.5826318e-2f,
404
1.79k
                       8.9893397e-3f,
405
1.79k
                       1.8775767e-3f);
406
407
1.79k
  return iexp * fexp;
408
1.79k
}
astcenc_entry.cpp:exp2(vfloat4)
Line
Count
Source
389
1.79k
{
390
1.79k
  x = clamp(-126.99999f, 129.0f, x);
391
392
1.79k
  vint4 ipart = float_to_int(x - 0.5f);
393
1.79k
  vfloat4 fpart = x - int_to_float(ipart);
394
395
  // Integer contrib, using 1 << ipart
396
1.79k
  vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
397
398
  // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
399
1.79k
  vfloat4 fexp = POLY5(fpart,
400
1.79k
                       9.9999994e-1f,
401
1.79k
                       6.9315308e-1f,
402
1.79k
                       2.4015361e-1f,
403
1.79k
                       5.5826318e-2f,
404
1.79k
                       8.9893397e-3f,
405
1.79k
                       1.8775767e-3f);
406
407
1.79k
  return iexp * fexp;
408
1.79k
}
Unexecuted instantiation: astcenc_image.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:exp2(vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:exp2(vfloat4)
409
410
/**
411
 * @brief Compute an approximate log2(x) for each lane in the vector.
412
 *
413
 * Based on 5th degree minimax polynomials, ported from this blog
414
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
415
 */
416
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
417
1.79k
{
418
1.79k
  vint4 exp(0x7F800000);
419
1.79k
  vint4 mant(0x007FFFFF);
420
1.79k
  vint4 one(0x3F800000);
421
422
1.79k
  vint4 i = float_as_int(x);
423
424
1.79k
  vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
425
426
1.79k
  vfloat4 m = int_as_float((i & mant) | one);
427
428
  // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
429
1.79k
  vfloat4 p = POLY4(m,
430
1.79k
                    2.8882704548164776201f,
431
1.79k
                   -2.52074962577807006663f,
432
1.79k
                    1.48116647521213171641f,
433
1.79k
                   -0.465725644288844778798f,
434
1.79k
                    0.0596515482674574969533f);
435
436
  // Increases the polynomial degree, but ensures that log2(1) == 0
437
1.79k
  p = p * (m - 1.0f);
438
439
1.79k
  return p + e;
440
1.79k
}
astcenc_entry.cpp:log2(vfloat4)
Line
Count
Source
417
1.79k
{
418
1.79k
  vint4 exp(0x7F800000);
419
1.79k
  vint4 mant(0x007FFFFF);
420
1.79k
  vint4 one(0x3F800000);
421
422
1.79k
  vint4 i = float_as_int(x);
423
424
1.79k
  vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
425
426
1.79k
  vfloat4 m = int_as_float((i & mant) | one);
427
428
  // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
429
1.79k
  vfloat4 p = POLY4(m,
430
1.79k
                    2.8882704548164776201f,
431
1.79k
                   -2.52074962577807006663f,
432
1.79k
                    1.48116647521213171641f,
433
1.79k
                   -0.465725644288844778798f,
434
1.79k
                    0.0596515482674574969533f);
435
436
  // Increases the polynomial degree, but ensures that log2(1) == 0
437
1.79k
  p = p * (m - 1.0f);
438
439
1.79k
  return p + e;
440
1.79k
}
Unexecuted instantiation: astcenc_image.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:log2(vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:log2(vfloat4)
441
442
/**
443
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
444
 *
445
 * Power function based on the exp2(log2(x) * y) transform.
446
 */
447
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
448
1.79k
{
449
1.79k
  vmask4 zero_mask = y == vfloat4(0.0f);
450
1.79k
  vfloat4 estimate = exp2(log2(x) * y);
451
452
  // Guarantee that y == 0 returns exactly 1.0f
453
1.79k
  return select(estimate, vfloat4(1.0f), zero_mask);
454
1.79k
}
astcenc_entry.cpp:pow(vfloat4, vfloat4)
Line
Count
Source
448
1.79k
{
449
1.79k
  vmask4 zero_mask = y == vfloat4(0.0f);
450
1.79k
  vfloat4 estimate = exp2(log2(x) * y);
451
452
  // Guarantee that y == 0 returns exactly 1.0f
453
1.79k
  return select(estimate, vfloat4(1.0f), zero_mask);
454
1.79k
}
Unexecuted instantiation: astcenc_image.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:pow(vfloat4, vfloat4)
455
456
/**
457
 * @brief Count the leading zeros for each lane in @c a.
458
 *
459
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
460
 */
461
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
462
190k
{
463
  // This function is a horrible abuse of floating point exponents to convert
464
  // the original integer value into a 2^N encoding we can recover easily.
465
466
  // Convert to float without risk of rounding up by keeping only top 8 bits.
467
  // This trick is guaranteed to keep top 8 bits and clear the 9th.
468
190k
  a = (~lsr<8>(a)) & a;
469
190k
  a = float_as_int(int_to_float(a));
470
471
  // Extract and unbias exponent
472
190k
  a = vint4(127 + 31) - lsr<23>(a);
473
474
  // Clamp result to a valid 32-bit range
475
190k
  return clamp(0, 32, a);
476
190k
}
Unexecuted instantiation: astcenc_entry.cpp:clz(vint4)
Unexecuted instantiation: astcenc_image.cpp:clz(vint4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:clz(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:clz(vint4)
Unexecuted instantiation: astcenc_weight_align.cpp:clz(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:clz(vint4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:clz(vint4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:clz(vint4)
Unexecuted instantiation: astcenc_compute_variance.cpp:clz(vint4)
astcenc_decompress_symbolic.cpp:clz(vint4)
Line
Count
Source
462
190k
{
463
  // This function is a horrible abuse of floating point exponents to convert
464
  // the original integer value into a 2^N encoding we can recover easily.
465
466
  // Convert to float without risk of rounding up by keeping only top 8 bits.
467
  // This trick is guaranteed to keep top 8 bits and clear the 9th.
468
190k
  a = (~lsr<8>(a)) & a;
469
190k
  a = float_as_int(int_to_float(a));
470
471
  // Extract and unbias exponent
472
190k
  a = vint4(127 + 31) - lsr<23>(a);
473
474
  // Clamp result to a valid 32-bit range
475
190k
  return clamp(0, 32, a);
476
190k
}
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:clz(vint4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:clz(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:clz(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:clz(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:clz(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:clz(vint4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:clz(vint4)
Unexecuted instantiation: astcenc_color_quantize.cpp:clz(vint4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:clz(vint4)
477
478
/**
479
 * @brief Return lanewise 2^a for each lane in @c a.
480
 *
481
 * Use of signed int means that this is only valid for values in range [0, 30].
482
 */
483
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
484
190k
{
485
  // 2^30 is the largest power of two that a signed int can represent
486
190k
  assert(all(a < vint4(31)));
487
488
  // This function is a horrible abuse of floating point to use the exponent
489
  // and float conversion to generate a 2^N multiple.
490
491
  // Bias the exponent
492
190k
  vint4 exp = a + 127;
493
190k
  exp = lsl<23>(exp);
494
495
  // Reinterpret the bits as a float, and then convert to an int
496
190k
  vfloat4 f = int_as_float(exp);
497
190k
  return float_to_int(f);
498
190k
}
Unexecuted instantiation: astcenc_entry.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_image.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_weight_align.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_compute_variance.cpp:two_to_the_n(vint4)
astcenc_decompress_symbolic.cpp:two_to_the_n(vint4)
Line
Count
Source
484
190k
{
485
  // 2^30 is the largest power of two that a signed int can represent
486
190k
  assert(all(a < vint4(31)));
487
488
  // This function is a horrible abuse of floating point to use the exponent
489
  // and float conversion to generate a 2^N multiple.
490
491
  // Bias the exponent
492
190k
  vint4 exp = a + 127;
493
190k
  exp = lsl<23>(exp);
494
495
  // Reinterpret the bits as a float, and then convert to an int
496
190k
  vfloat4 f = int_as_float(exp);
497
190k
  return float_to_int(f);
498
190k
}
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_color_quantize.cpp:two_to_the_n(vint4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:two_to_the_n(vint4)
499
500
/**
501
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
502
 */
503
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
504
190k
{
505
190k
  vint4 fp16_one = vint4(0x3C00);
506
190k
  vint4 fp16_small = lsl<8>(p);
507
508
190k
  vmask4 is_one = p == vint4(0xFFFF);
509
190k
  vmask4 is_small = p < vint4(4);
510
511
  // Manually inline clz() on Visual Studio to avoid release build codegen bug
512
  // see https://github.com/ARM-software/astc-encoder/issues/259
513
#if !defined(__clang__) && defined(_MSC_VER)
514
  vint4 a = (~lsr<8>(p)) & p;
515
  a = float_as_int(int_to_float(a));
516
  a = vint4(127 + 31) - lsr<23>(a);
517
  vint4 lz = clamp(0, 32, a) - 16;
518
#else
519
190k
  vint4 lz = clz(p) - 16;
520
190k
#endif
521
522
190k
  p = p * two_to_the_n(lz + 1);
523
190k
  p = p & vint4(0xFFFF);
524
525
190k
  p = lsr<6>(p);
526
527
190k
  p = p | lsl<10>(vint4(14) - lz);
528
529
190k
  vint4 r = select(p, fp16_one, is_one);
530
190k
  r = select(r, fp16_small, is_small);
531
190k
  return r;
532
190k
}
Unexecuted instantiation: astcenc_entry.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_image.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_align.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_compute_variance.cpp:unorm16_to_sf16(vint4)
astcenc_decompress_symbolic.cpp:unorm16_to_sf16(vint4)
Line
Count
Source
504
190k
{
505
190k
  vint4 fp16_one = vint4(0x3C00);
506
190k
  vint4 fp16_small = lsl<8>(p);
507
508
190k
  vmask4 is_one = p == vint4(0xFFFF);
509
190k
  vmask4 is_small = p < vint4(4);
510
511
  // Manually inline clz() on Visual Studio to avoid release build codegen bug
512
  // see https://github.com/ARM-software/astc-encoder/issues/259
513
#if !defined(__clang__) && defined(_MSC_VER)
514
  vint4 a = (~lsr<8>(p)) & p;
515
  a = float_as_int(int_to_float(a));
516
  a = vint4(127 + 31) - lsr<23>(a);
517
  vint4 lz = clamp(0, 32, a) - 16;
518
#else
519
190k
  vint4 lz = clz(p) - 16;
520
190k
#endif
521
522
190k
  p = p * two_to_the_n(lz + 1);
523
190k
  p = p & vint4(0xFFFF);
524
525
190k
  p = lsr<6>(p);
526
527
190k
  p = p | lsl<10>(vint4(14) - lz);
528
529
190k
  vint4 r = select(p, fp16_one, is_one);
530
190k
  r = select(r, fp16_small, is_small);
531
190k
  return r;
532
190k
}
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_color_quantize.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unorm16_to_sf16(vint4)
533
534
/**
535
 * @brief Convert 16-bit LNS to float16.
536
 */
537
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
538
36.6k
{
539
36.6k
  vint4 mc = p & 0x7FF;
540
36.6k
  vint4 ec = lsr<11>(p);
541
542
36.6k
  vint4 mc_512 = mc * 3;
543
36.6k
  vmask4 mask_512 = mc < vint4(512);
544
545
36.6k
  vint4 mc_1536 = mc * 4 - 512;
546
36.6k
  vmask4 mask_1536 = mc < vint4(1536);
547
548
36.6k
  vint4 mc_else = mc * 5 - 2048;
549
550
36.6k
  vint4 mt = mc_else;
551
36.6k
  mt = select(mt, mc_1536, mask_1536);
552
36.6k
  mt = select(mt, mc_512, mask_512);
553
554
36.6k
  vint4 res = lsl<10>(ec) | lsr<3>(mt);
555
36.6k
  return min(res, vint4(0x7BFF));
556
36.6k
}
Unexecuted instantiation: astcenc_entry.cpp:lns_to_sf16(vint4)
astcenc_image.cpp:lns_to_sf16(vint4)
Line
Count
Source
538
1.07k
{
539
1.07k
  vint4 mc = p & 0x7FF;
540
1.07k
  vint4 ec = lsr<11>(p);
541
542
1.07k
  vint4 mc_512 = mc * 3;
543
1.07k
  vmask4 mask_512 = mc < vint4(512);
544
545
1.07k
  vint4 mc_1536 = mc * 4 - 512;
546
1.07k
  vmask4 mask_1536 = mc < vint4(1536);
547
548
1.07k
  vint4 mc_else = mc * 5 - 2048;
549
550
1.07k
  vint4 mt = mc_else;
551
1.07k
  mt = select(mt, mc_1536, mask_1536);
552
1.07k
  mt = select(mt, mc_512, mask_512);
553
554
1.07k
  vint4 res = lsl<10>(ec) | lsr<3>(mt);
555
1.07k
  return min(res, vint4(0x7BFF));
556
1.07k
}
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_align.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_compute_variance.cpp:lns_to_sf16(vint4)
astcenc_decompress_symbolic.cpp:lns_to_sf16(vint4)
Line
Count
Source
538
35.5k
{
539
35.5k
  vint4 mc = p & 0x7FF;
540
35.5k
  vint4 ec = lsr<11>(p);
541
542
35.5k
  vint4 mc_512 = mc * 3;
543
35.5k
  vmask4 mask_512 = mc < vint4(512);
544
545
35.5k
  vint4 mc_1536 = mc * 4 - 512;
546
35.5k
  vmask4 mask_1536 = mc < vint4(1536);
547
548
35.5k
  vint4 mc_else = mc * 5 - 2048;
549
550
35.5k
  vint4 mt = mc_else;
551
35.5k
  mt = select(mt, mc_1536, mask_1536);
552
35.5k
  mt = select(mt, mc_512, mask_512);
553
554
35.5k
  vint4 res = lsl<10>(ec) | lsr<3>(mt);
555
35.5k
  return min(res, vint4(0x7BFF));
556
35.5k
}
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_color_quantize.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:lns_to_sf16(vint4)
557
558
/**
559
 * @brief Extract mantissa and exponent of a float value.
560
 *
561
 * @param      a      The input value.
562
 * @param[out] exp    The output exponent.
563
 *
564
 * @return The mantissa.
565
 */
566
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
567
27.0k
{
568
  // Interpret the bits as an integer
569
27.0k
  vint4 ai = float_as_int(a);
570
571
  // Extract and unbias the exponent
572
27.0k
  exp = (lsr<23>(ai) & 0xFF) - 126;
573
574
  // Extract and unbias the mantissa
575
27.0k
  vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
576
27.0k
  return int_as_float(manti);
577
27.0k
}
Unexecuted instantiation: astcenc_entry.cpp:frexp(vfloat4, vint4&)
astcenc_image.cpp:frexp(vfloat4, vint4&)
Line
Count
Source
567
27.0k
{
568
  // Interpret the bits as an integer
569
27.0k
  vint4 ai = float_as_int(a);
570
571
  // Extract and unbias the exponent
572
27.0k
  exp = (lsr<23>(ai) & 0xFF) - 126;
573
574
  // Extract and unbias the mantissa
575
27.0k
  vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
576
27.0k
  return int_as_float(manti);
577
27.0k
}
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_percentile_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_weight_align.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_block_sizes.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_color_unquantize.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_compute_variance.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_integer_sequence.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_mathlib.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_partition_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_quantization.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_color_quantize.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:frexp(vfloat4, vint4&)
578
579
/**
580
 * @brief Convert float to 16-bit LNS.
581
 */
582
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
583
27.0k
{
584
27.0k
  vint4 exp;
585
27.0k
  vfloat4 mant = frexp(a, exp);
586
587
  // Do these early before we start messing about ...
588
27.0k
  vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
589
27.0k
  vmask4 mask_infinity = a >= vfloat4(65536.0f);
590
591
  // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
592
27.0k
  vmask4 exp_lt_m13 = exp < vint4(-13);
593
594
27.0k
  vfloat4 a1a = a * 33554432.0f;
595
27.0k
  vint4 expa = vint4::zero();
596
597
27.0k
  vfloat4 a1b = (mant - 0.5f) * 4096;
598
27.0k
  vint4 expb = exp + 14;
599
600
27.0k
  a = select(a1b, a1a, exp_lt_m13);
601
27.0k
  exp = select(expb, expa, exp_lt_m13);
602
603
27.0k
  vmask4 a_lt_384 = a < vfloat4(384.0f);
604
27.0k
  vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
605
606
27.0k
  vfloat4 a2a = a * (4.0f / 3.0f);
607
27.0k
  vfloat4 a2b = a + 128.0f;
608
27.0k
  vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
609
610
27.0k
  a = a2c;
611
27.0k
  a = select(a, a2b, a_lt_1408);
612
27.0k
  a = select(a, a2a, a_lt_384);
613
614
27.0k
  a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
615
616
27.0k
  a = select(a, vfloat4(65535.0f), mask_infinity);
617
27.0k
  a = select(a, vfloat4::zero(), mask_underflow_nan);
618
619
27.0k
  return a;
620
27.0k
}
Unexecuted instantiation: astcenc_entry.cpp:float_to_lns(vfloat4)
astcenc_image.cpp:float_to_lns(vfloat4)
Line
Count
Source
583
27.0k
{
584
27.0k
  vint4 exp;
585
27.0k
  vfloat4 mant = frexp(a, exp);
586
587
  // Do these early before we start messing about ...
588
27.0k
  vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
589
27.0k
  vmask4 mask_infinity = a >= vfloat4(65536.0f);
590
591
  // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
592
27.0k
  vmask4 exp_lt_m13 = exp < vint4(-13);
593
594
27.0k
  vfloat4 a1a = a * 33554432.0f;
595
27.0k
  vint4 expa = vint4::zero();
596
597
27.0k
  vfloat4 a1b = (mant - 0.5f) * 4096;
598
27.0k
  vint4 expb = exp + 14;
599
600
27.0k
  a = select(a1b, a1a, exp_lt_m13);
601
27.0k
  exp = select(expb, expa, exp_lt_m13);
602
603
27.0k
  vmask4 a_lt_384 = a < vfloat4(384.0f);
604
27.0k
  vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
605
606
27.0k
  vfloat4 a2a = a * (4.0f / 3.0f);
607
27.0k
  vfloat4 a2b = a + 128.0f;
608
27.0k
  vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
609
610
27.0k
  a = a2c;
611
27.0k
  a = select(a, a2b, a_lt_1408);
612
27.0k
  a = select(a, a2a, a_lt_384);
613
614
27.0k
  a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
615
616
27.0k
  a = select(a, vfloat4(65535.0f), mask_infinity);
617
27.0k
  a = select(a, vfloat4::zero(), mask_underflow_nan);
618
619
27.0k
  return a;
620
27.0k
}
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_weight_align.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_color_unquantize.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_compute_variance.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_color_quantize.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:float_to_lns(vfloat4)
621
622
namespace astc
623
{
624
625
static ASTCENC_SIMD_INLINE float pow(float x, float y)
626
1.79k
{
627
1.79k
  return pow(vfloat4(x), vfloat4(y)).lane<0>();
628
1.79k
}
astcenc_entry.cpp:astc::pow(float, float)
Line
Count
Source
626
1.79k
{
627
1.79k
  return pow(vfloat4(x), vfloat4(y)).lane<0>();
628
1.79k
}
Unexecuted instantiation: astcenc_image.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_weight_align.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_color_unquantize.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_compute_variance.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_integer_sequence.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_quantization.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_color_quantize.cpp:astc::pow(float, float)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:astc::pow(float, float)
629
630
}
631
632
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED