Coverage Report

Created: 2024-09-08 06:14

/src/astc-encoder/Source/astcenc_vecmathlib.h
Line
Count
Source (jump to first uncovered line)
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2019-2024 Arm Limited
4
// Copyright 2008 Jose Fonseca
5
//
6
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
7
// use this file except in compliance with the License. You may obtain a copy
8
// of the License at:
9
//
10
//     http://www.apache.org/licenses/LICENSE-2.0
11
//
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15
// License for the specific language governing permissions and limitations
16
// under the License.
17
// ----------------------------------------------------------------------------
18
19
/*
20
 * This module implements vector support for floats, ints, and vector lane
21
 * control masks. It provides access to both explicit vector width types, and
22
 * flexible N-wide types where N can be determined at compile time.
23
 *
24
 * The design of this module encourages use of vector length agnostic code, via
25
 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
26
 * width that is available at compile time. The current vector width is
27
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
28
 *
29
 * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
30
 * These are provided primarily for prototyping and algorithm debug of VLA
31
 * implementations.
32
 *
33
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
34
 * types. These are provided for use by VLA code, but are also expected to be
35
 * used as a fixed-width type and will support a reference C++ fallback for
36
 * use on platforms without SIMD intrinsics.
37
 *
38
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
39
 * types. These are provided for use by VLA code, and are not expected to be
40
 * used as a fixed-width type in normal code. No reference C implementation is
41
 * provided on platforms without underlying SIMD intrinsics.
42
 *
43
 * With the current implementation ISA support is provided for:
44
 *
45
 *     * 1-wide for scalar reference
46
 *     * 4-wide for Armv8-A NEON
47
 *     * 4-wide for x86-64 SSE2
48
 *     * 4-wide for x86-64 SSE4.1
49
 *     * 8-wide for Armv8-A SVE
50
 *     * 8-wide for x86-64 AVX2
51
 */
52
53
#ifndef ASTC_VECMATHLIB_H_INCLUDED
54
#define ASTC_VECMATHLIB_H_INCLUDED
55
56
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
57
  #include <immintrin.h>
58
#endif
59
60
#if ASTCENC_SVE != 0
61
  #include <arm_sve.h>
62
  #include <arm_neon_sve_bridge.h>
63
#endif
64
65
#if ASTCENC_NEON != 0
66
  #include <arm_neon.h>
67
#endif
68
69
// Compiler-specific inlining attributes used by the functions in this header:
//   ASTCENC_SIMD_INLINE - request that the function is always inlined.
//   ASTCENC_NO_INLINE   - request that the function is never inlined.
// MSVC (when not compiling with clang): ASTCENC_NO_INLINE expands to nothing.
#if !defined(__clang__) && defined(_MSC_VER)
70
  #define ASTCENC_SIMD_INLINE __forceinline
71
  #define ASTCENC_NO_INLINE
72
// GCC (when not compiling with clang)
#elif defined(__GNUC__) && !defined(__clang__)
73
  #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
74
  #define ASTCENC_NO_INLINE __attribute__ ((noinline))
75
// Everything else, including clang: as for GCC, plus the nodebug attribute
#else
76
  #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
77
  #define ASTCENC_NO_INLINE __attribute__ ((noinline))
78
#endif
79
80
// Select the SIMD backend. The first matching branch wins, in this order:
// AVX2 (8-wide), SSE (4-wide), 256-bit SVE (8-wide), NEON (4-wide), and
// finally the non-SIMD reference implementation (4-wide).
//
// Every branch defines the same set of names for vector length agnostic code:
// ASTCENC_SIMD_WIDTH, vfloat, vfloatacc, vint, vmask, vtable_16x8/32x8/64x8,
// and the loada / load1 constants taken from the selected float vector type.
#if ASTCENC_AVX >= 2
81
  // If we have AVX2 expose 8-wide VLA.
82
  #include "astcenc_vecmathlib_sse_4.h"
83
  #include "astcenc_vecmathlib_common_4.h"
84
  #include "astcenc_vecmathlib_avx2_8.h"
85
86
  #define ASTCENC_SIMD_WIDTH 8
87
88
  using vfloat = vfloat8;
89
90
  // The accumulator type is only 8-wide when ASTCENC_NO_INVARIANCE is defined;
  // otherwise it stays 4-wide. NOTE(review): presumably this keeps summation
  // order identical to the 4-wide builds (see the reassociation note in the
  // fallback branch below) - confirm against the users of vfloatacc.
  #if defined(ASTCENC_NO_INVARIANCE)
91
    using vfloatacc = vfloat8;
92
  #else
93
    using vfloatacc = vfloat4;
94
  #endif
95
96
  using vint = vint8;
97
  using vmask = vmask8;
98
99
  using vtable_16x8 = vtable8_16x8;
100
  using vtable_32x8 = vtable8_32x8;
101
  using vtable_64x8 = vtable8_64x8;
102
103
  constexpr auto loada = vfloat8::loada;
104
  constexpr auto load1 = vfloat8::load1;
105
106
#elif ASTCENC_SSE >= 20
107
  // If we have SSE expose 4-wide VLA, and 4-wide fixed width.
108
  #include "astcenc_vecmathlib_sse_4.h"
109
  #include "astcenc_vecmathlib_common_4.h"
110
111
  #define ASTCENC_SIMD_WIDTH 4
112
113
  using vfloat = vfloat4;
114
  using vfloatacc = vfloat4;
115
  using vint = vint4;
116
  using vmask = vmask4;
117
118
  using vtable_16x8 = vtable4_16x8;
119
  using vtable_32x8 = vtable4_32x8;
120
  using vtable_64x8 = vtable4_64x8;
121
122
  constexpr auto loada = vfloat4::loada;
123
  constexpr auto load1 = vfloat4::load1;
124
125
#elif ASTCENC_SVE == 8
126
  // Check the compiler is configured with fixed-length 256-bit SVE.
127
  #if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
128
    #error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
129
  #endif
130
131
  // If we have SVE configured as 8-wide, expose 8-wide VLA.
132
  #include "astcenc_vecmathlib_neon_4.h"
133
  #include "astcenc_vecmathlib_common_4.h"
134
  #include "astcenc_vecmathlib_sve_8.h"
135
136
  #define ASTCENC_SIMD_WIDTH 8
137
138
  using vfloat = vfloat8;
139
140
  // Same accumulator width rule as the AVX2 branch above.
  #if defined(ASTCENC_NO_INVARIANCE)
141
    using vfloatacc = vfloat8;
142
  #else
143
    using vfloatacc = vfloat4;
144
  #endif
145
146
  using vint = vint8;
147
  using vmask = vmask8;
148
149
  using vtable_16x8 = vtable8_16x8;
150
  using vtable_32x8 = vtable8_32x8;
151
  using vtable_64x8 = vtable8_64x8;
152
153
  constexpr auto loada = vfloat8::loada;
154
  constexpr auto load1 = vfloat8::load1;
155
156
#elif ASTCENC_NEON > 0
157
  // If we have NEON expose 4-wide VLA.
158
  #include "astcenc_vecmathlib_neon_4.h"
159
  #include "astcenc_vecmathlib_common_4.h"
160
161
  #define ASTCENC_SIMD_WIDTH 4
162
163
  using vfloat = vfloat4;
164
  using vfloatacc = vfloat4;
165
  using vint = vint4;
166
  using vmask = vmask4;
167
168
  using vtable_16x8 = vtable4_16x8;
169
  using vtable_32x8 = vtable4_32x8;
170
  using vtable_64x8 = vtable4_64x8;
171
172
  constexpr auto loada = vfloat4::loada;
173
  constexpr auto load1 = vfloat4::load1;
174
175
#else
176
  // If we have nothing expose 4-wide VLA, and 4-wide fixed width.
177
178
  // Note: We no longer expose the 1-wide scalar fallback because it is not
179
  // invariant with the 4-wide path due to algorithms that use horizontal
180
  // operations that accumulate a local vector sum before accumulating into
181
  // a running sum.
182
  //
183
  // For 4 items adding into an accumulator using 1-wide vectors the sum is:
184
  //
185
  //     result = ((((sum + l0) + l1) + l2) + l3)
186
  //
187
  // ... whereas the accumulator for a 4-wide vector sum is:
188
  //
189
  //     result = sum + ((l0 + l2) + (l1 + l3))
190
  //
191
  // In "normal maths" this is the same, but the floating point reassociation
192
  // differences mean that these will not produce the same result.
193
194
  #include "astcenc_vecmathlib_none_4.h"
195
  #include "astcenc_vecmathlib_common_4.h"
196
197
121k
  #define ASTCENC_SIMD_WIDTH 4
198
199
  using vfloat = vfloat4;
200
  using vfloatacc = vfloat4;
201
  using vint = vint4;
202
  using vmask = vmask4;
203
204
  using vtable_16x8 = vtable4_16x8;
205
  using vtable_32x8 = vtable4_32x8;
206
  using vtable_64x8 = vtable4_64x8;
207
208
  constexpr auto loada = vfloat4::loada;
209
  constexpr auto load1 = vfloat4::load1;
210
#endif
211
212
/**
213
 * @brief Round a count down to the largest multiple of 8.
214
 *
215
 * @param count   The unrounded value.
216
 *
217
 * @return The rounded value.
218
 */
219
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
220
0
{
221
0
  return count & static_cast<unsigned int>(~(8 - 1));
222
0
}
223
224
/**
225
 * @brief Round a count down to the largest multiple of 4.
226
 *
227
 * @param count   The unrounded value.
228
 *
229
 * @return The rounded value.
230
 */
231
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
232
0
{
233
0
  return count & static_cast<unsigned int>(~(4 - 1));
234
0
}
235
236
/**
237
 * @brief Round a count down to the largest multiple of the SIMD width.
238
 *
239
 * Assumption that the vector width is a power of two ...
240
 *
241
 * @param count   The unrounded value.
242
 *
243
 * @return The rounded value.
244
 */
245
ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
246
0
{
247
0
  return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
248
0
}
249
250
/**
251
 * @brief Round a count up to the largest multiple of the SIMD width.
252
 *
253
 * Assumption that the vector width is a power of two ...
254
 *
255
 * @param count   The unrounded value.
256
 *
257
 * @return The rounded value.
258
 */
259
ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
260
40.5k
{
261
40.5k
  unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
262
40.5k
  return multiples * ASTCENC_SIMD_WIDTH;
263
40.5k
}
264
265
/**
266
 * @brief Return @c a with lanes negated if the @c b lane is negative.
267
 */
268
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
269
0
{
270
0
  vint ia = float_as_int(a);
271
0
  vint ib = float_as_int(b);
272
0
  vint sign_mask(static_cast<int>(0x80000000));
273
0
  vint r = ia ^ (ib & sign_mask);
274
0
  return int_as_float(r);
275
0
}
276
277
/**
278
 * @brief Return fast, but approximate, vector atan(x).
279
 *
280
 * Max error of this implementation is 0.004883.
281
 */
282
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
283
0
{
284
0
  vmask c = abs(x) > vfloat(1.0f);
285
0
  vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
286
0
  vfloat y = select(x, vfloat(1.0f) / x, c);
287
0
  y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
288
0
  return select(y, z - y, c);
289
0
}
290
291
/**
292
 * @brief Return fast, but approximate, vector atan2(x, y).
293
 */
294
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
295
0
{
296
0
  vfloat z = atan(abs(y / x));
297
0
  vmask xmask = x < vfloat::zero();
298
0
  return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
299
0
}
300
301
/*
302
 * @brief Factory that returns a unit length 4 component vfloat4.
303
 */
304
static ASTCENC_SIMD_INLINE vfloat4 unit4()
305
0
{
306
0
  return vfloat4(0.5f);
307
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit4()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit4()
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit4()
Unexecuted instantiation: astcenc_mathlib.cpp:unit4()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit4()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit4()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit4()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit4()
Unexecuted instantiation: astcenc_quantization.cpp:unit4()
308
309
/**
310
 * @brief Factory that returns a unit length 3 component vfloat4.
311
 */
312
static ASTCENC_SIMD_INLINE vfloat4 unit3()
313
0
{
314
0
  float val = 0.577350258827209473f;
315
0
  return vfloat4(val, val, val, 0.0f);
316
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit3()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit3()
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit3()
Unexecuted instantiation: astcenc_mathlib.cpp:unit3()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit3()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit3()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit3()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit3()
Unexecuted instantiation: astcenc_quantization.cpp:unit3()
317
318
/**
319
 * @brief Factory that returns a unit length 2 component vfloat4.
320
 */
321
static ASTCENC_SIMD_INLINE vfloat4 unit2()
322
0
{
323
0
  float val = 0.707106769084930420f;
324
0
  return vfloat4(val, val, 0.0f, 0.0f);
325
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unit2()
Unexecuted instantiation: astcenc_block_sizes.cpp:unit2()
Unexecuted instantiation: astcenc_integer_sequence.cpp:unit2()
Unexecuted instantiation: astcenc_mathlib.cpp:unit2()
Unexecuted instantiation: astcenc_partition_tables.cpp:unit2()
Unexecuted instantiation: astcenc_percentile_tables.cpp:unit2()
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unit2()
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unit2()
Unexecuted instantiation: astcenc_quantization.cpp:unit2()
326
327
/**
328
 * @brief Factory that returns a 3 component vfloat4.
329
 */
330
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
331
0
{
332
0
  return vfloat4(a, b, c, 0.0f);
333
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat3(float, float, float)
Unexecuted instantiation: astcenc_quantization.cpp:vfloat3(float, float, float)
334
335
/**
336
 * @brief Factory that returns a 2 component vfloat4.
337
 */
338
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
339
0
{
340
0
  return vfloat4(a, b, 0.0f, 0.0f);
341
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_integer_sequence.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:vfloat2(float, float)
Unexecuted instantiation: astcenc_quantization.cpp:vfloat2(float, float)
342
343
/**
344
 * @brief Normalize a non-zero length vector to unit length.
345
 */
346
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
347
0
{
348
0
  vfloat4 length = dot(a, a);
349
0
  return a / sqrt(length);
350
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:normalize(vfloat4)
351
352
/**
353
 * @brief Normalize a vector, returning @c safe if len is zero.
354
 */
355
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
356
0
{
357
0
  vfloat4 length = dot(a, a);
358
0
  if (length.lane<0>() != 0.0f)
359
0
  {
360
0
    return a / sqrt(length);
361
0
  }
362
0
363
0
  return safe;
364
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:normalize_safe(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:normalize_safe(vfloat4, vfloat4)
365
366
367
368
// Horner-form polynomial evaluation: POLYn(x, c0, ..., cn) expands to
// c0 + x * (c1 + x * (c2 + ... + x * cn)). Every macro argument is wrapped in
// parentheses so that expression arguments keep their intended precedence.
// Note that x is expanded once per polynomial degree, so it must be free of
// side effects.
#define POLY0(x, c0)                     (                                       (c0))
#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * (x))                 + (c0))
#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * (x))             + (c0))
#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * (x))         + (c0))
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * (x))     + (c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * (x)) + (c0))
374
375
/**
376
 * @brief Compute an approximate exp2(x) for each lane in the vector.
377
 *
378
 * Based on 5th degree minimax polynomials, ported from this blog
379
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
380
 */
381
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
382
0
{
383
0
  x = clamp(-126.99999f, 129.0f, x);
384
0
385
0
  vint4 ipart = float_to_int(x - 0.5f);
386
0
  vfloat4 fpart = x - int_to_float(ipart);
387
0
388
0
  // Integer contrib, using 1 << ipart
389
0
  vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
390
0
391
0
  // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
392
0
  vfloat4 fexp = POLY5(fpart,
393
0
                       9.9999994e-1f,
394
0
                       6.9315308e-1f,
395
0
                       2.4015361e-1f,
396
0
                       5.5826318e-2f,
397
0
                       8.9893397e-3f,
398
0
                       1.8775767e-3f);
399
0
400
0
  return iexp * fexp;
401
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:exp2(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:exp2(vfloat4)
402
403
/**
404
 * @brief Compute an approximate log2(x) for each lane in the vector.
405
 *
406
 * Based on 5th degree minimax polynomials, ported from this blog
407
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
408
 */
409
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
410
0
{
411
0
  vint4 exp(0x7F800000);
412
0
  vint4 mant(0x007FFFFF);
413
0
  vint4 one(0x3F800000);
414
0
415
0
  vint4 i = float_as_int(x);
416
0
417
0
  vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
418
0
419
0
  vfloat4 m = int_as_float((i & mant) | one);
420
0
421
0
  // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
422
0
  vfloat4 p = POLY4(m,
423
0
                    2.8882704548164776201f,
424
0
                   -2.52074962577807006663f,
425
0
                    1.48116647521213171641f,
426
0
                   -0.465725644288844778798f,
427
0
                    0.0596515482674574969533f);
428
0
429
0
  // Increases the polynomial degree, but ensures that log2(1) == 0
430
0
  p = p * (m - 1.0f);
431
0
432
0
  return p + e;
433
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:log2(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:log2(vfloat4)
434
435
/**
436
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
437
 *
438
 * Power function based on the exp2(log2(x) * y) transform.
439
 */
440
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
441
0
{
442
0
  vmask4 zero_mask = y == vfloat4(0.0f);
443
0
  vfloat4 estimate = exp2(log2(x) * y);
444
0
445
0
  // Guarantee that y == 0 returns exactly 1.0f
446
0
  return select(estimate, vfloat4(1.0f), zero_mask);
447
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:pow(vfloat4, vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:pow(vfloat4, vfloat4)
448
449
/**
450
 * @brief Count the leading zeros for each lane in @c a.
451
 *
452
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
453
 */
454
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
455
0
{
456
0
  // This function is a horrible abuse of floating point exponents to convert
457
0
  // the original integer value into a 2^N encoding we can recover easily.
458
0
459
0
  // Convert to float without risk of rounding up by keeping only top 8 bits.
460
0
  // This trick is is guaranteed to keep top 8 bits and clear the 9th.
461
0
  a = (~lsr<8>(a)) & a;
462
0
  a = float_as_int(int_to_float(a));
463
0
464
0
  // Extract and unbias exponent
465
0
  a = vint4(127 + 31) - lsr<23>(a);
466
0
467
0
  // Clamp result to a valid 32-bit range
468
0
  return clamp(0, 32, a);
469
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:clz(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:clz(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:clz(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:clz(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:clz(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:clz(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:clz(vint4)
470
471
/**
472
 * @brief Return lanewise 2^a for each lane in @c a.
473
 *
474
 * Use of signed int means that this is only valid for values in range [0, 31].
475
 */
476
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
477
0
{
478
0
  // 2^30 is the largest signed number than can be represented
479
0
  assert(all(a < vint4(31)));
480
0
481
0
  // This function is a horrible abuse of floating point to use the exponent
482
0
  // and float conversion to generate a 2^N multiple.
483
0
484
0
  // Bias the exponent
485
0
  vint4 exp = a + 127;
486
0
  exp = lsl<23>(exp);
487
0
488
0
  // Reinterpret the bits as a float, and then convert to an int
489
0
  vfloat4 f = int_as_float(exp);
490
0
  return float_to_int(f);
491
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:two_to_the_n(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:two_to_the_n(vint4)
492
493
/**
494
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
495
 */
496
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
497
0
{
498
0
  vint4 fp16_one = vint4(0x3C00);
499
0
  vint4 fp16_small = lsl<8>(p);
500
0
501
0
  vmask4 is_one = p == vint4(0xFFFF);
502
0
  vmask4 is_small = p < vint4(4);
503
0
504
0
  // Manually inline clz() on Visual Studio to avoid release build codegen bug
505
0
  // see https://github.com/ARM-software/astc-encoder/issues/259
506
0
#if !defined(__clang__) && defined(_MSC_VER)
507
0
  vint4 a = (~lsr<8>(p)) & p;
508
0
  a = float_as_int(int_to_float(a));
509
0
  a = vint4(127 + 31) - lsr<23>(a);
510
0
  vint4 lz = clamp(0, 32, a) - 16;
511
0
#else
512
0
  vint4 lz = clz(p) - 16;
513
0
#endif
514
0
515
0
  p = p * two_to_the_n(lz + 1);
516
0
  p = p & vint4(0xFFFF);
517
0
518
0
  p = lsr<6>(p);
519
0
520
0
  p = p | lsl<10>(vint4(14) - lz);
521
0
522
0
  vint4 r = select(p, fp16_one, is_one);
523
0
  r = select(r, fp16_small, is_small);
524
0
  return r;
525
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:unorm16_to_sf16(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:unorm16_to_sf16(vint4)
526
527
/**
528
 * @brief Convert 16-bit LNS to float16.
529
 */
530
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
531
0
{
532
0
  vint4 mc = p & 0x7FF;
533
0
  vint4 ec = lsr<11>(p);
534
0
535
0
  vint4 mc_512 = mc * 3;
536
0
  vmask4 mask_512 = mc < vint4(512);
537
0
538
0
  vint4 mc_1536 = mc * 4 - 512;
539
0
  vmask4 mask_1536 = mc < vint4(1536);
540
0
541
0
  vint4 mc_else = mc * 5 - 2048;
542
0
543
0
  vint4 mt = mc_else;
544
0
  mt = select(mt, mc_1536, mask_1536);
545
0
  mt = select(mt, mc_512, mask_512);
546
0
547
0
  vint4 res = lsl<10>(ec) | lsr<3>(mt);
548
0
  return min(res, vint4(0x7BFF));
549
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_block_sizes.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_mathlib.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_partition_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:lns_to_sf16(vint4)
Unexecuted instantiation: astcenc_quantization.cpp:lns_to_sf16(vint4)
550
551
/**
552
 * @brief Extract mantissa and exponent of a float value.
553
 *
554
 * @param      a      The input value.
555
 * @param[out] exp    The output exponent.
556
 *
557
 * @return The mantissa.
558
 */
559
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
560
0
{
561
0
  // Interpret the bits as an integer
562
0
  vint4 ai = float_as_int(a);
563
0
564
0
  // Extract and unbias the exponent
565
0
  exp = (lsr<23>(ai) & 0xFF) - 126;
566
0
567
0
  // Extract and unbias the mantissa
568
0
  vint4 manti = (ai &  static_cast<int>(0x807FFFFF)) | 0x3F000000;
569
0
  return int_as_float(manti);
570
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_block_sizes.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_integer_sequence.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_mathlib.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_partition_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_percentile_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:frexp(vfloat4, vint4&)
Unexecuted instantiation: astcenc_quantization.cpp:frexp(vfloat4, vint4&)
571
572
/**
573
 * @brief Convert float to 16-bit LNS.
574
 */
575
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
576
0
{
577
0
  vint4 exp;
578
0
  vfloat4 mant = frexp(a, exp);
579
0
580
0
  // Do these early before we start messing about ...
581
0
  vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
582
0
  vmask4 mask_infinity = a >= vfloat4(65536.0f);
583
0
584
0
  // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
585
0
  vmask4 exp_lt_m13 = exp < vint4(-13);
586
0
587
0
  vfloat4 a1a = a * 33554432.0f;
588
0
  vint4 expa = vint4::zero();
589
0
590
0
  vfloat4 a1b = (mant - 0.5f) * 4096;
591
0
  vint4 expb = exp + 14;
592
0
593
0
  a = select(a1b, a1a, exp_lt_m13);
594
0
  exp = select(expb, expa, exp_lt_m13);
595
0
596
0
  vmask4 a_lt_384 = a < vfloat4(384.0f);
597
0
  vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
598
0
599
0
  vfloat4 a2a = a * (4.0f / 3.0f);
600
0
  vfloat4 a2b = a + 128.0f;
601
0
  vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
602
0
603
0
  a = a2c;
604
0
  a = select(a, a2b, a_lt_1408);
605
0
  a = select(a, a2a, a_lt_384);
606
0
607
0
  a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
608
0
609
0
  a = select(a, vfloat4(65535.0f), mask_infinity);
610
0
  a = select(a, vfloat4::zero(), mask_underflow_nan);
611
0
612
0
  return a;
613
0
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_block_sizes.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_integer_sequence.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_mathlib.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_partition_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_percentile_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:float_to_lns(vfloat4)
Unexecuted instantiation: astcenc_quantization.cpp:float_to_lns(vfloat4)
614
615
namespace astc
616
{
617
618
/**
 * @brief Compute an approximate scalar pow(x, y).
 *
 * Both arguments are broadcast into vfloat4 values, the vector pow() is
 * evaluated, and lane 0 of the result is returned.
 */
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
  vfloat4 result = pow(vfloat4(x), vfloat4(y));
  return result.lane<0>();
}
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_block_sizes.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_integer_sequence.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_mathlib.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_partition_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:astc::pow(float, float)
Unexecuted instantiation: astcenc_quantization.cpp:astc::pow(float, float)
622
623
}
624
625
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED