Coverage Report

Created: 2026-06-10 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/astc-encoder/Source/astcenc_vecmathlib_none_4.h
Line
Count
Source
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2019-2026 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
/**
19
 * @brief 4x32-bit vectors, implemented using plain C++.
20
 *
21
 * This module implements 4-wide 32-bit float, int, and mask vectors. This
22
 * module provides a scalar fallback for VLA code, primarily useful for
23
 * debugging VLA algorithms without the complexity of handling SIMD. Only the
24
 * baseline level of functionality needed to support VLA is provided.
25
 *
26
 * Note that the vector conditional operators implemented by this module are
27
 * designed to behave like SIMD conditional operators that generate lane masks.
28
 * Rather than returning 0/1 booleans like normal C++ code they will return
29
 * 0/-1 to give a full lane-width bitmask.
30
 *
31
 * Note that the documentation for this module still talks about "vectors" to
32
 * help developers think about the implied VLA behavior when writing optimized
33
 * paths.
34
 */
35
36
#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
37
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED
38
39
#ifndef ASTCENC_SIMD_INLINE
40
  #error "Include astcenc_vecmathlib.h, do not include directly"
41
#endif
42
43
#include <algorithm>
44
#include <cstdio>
45
#include <cstring>
46
#include <cfenv>
47
48
// ============================================================================
49
// vfloat4 data type
50
// ============================================================================
51
52
/**
53
 * @brief Data type for 4-wide floats.
54
 */
55
struct vfloat4
56
{
57
  /**
58
   * @brief Default constructor; lanes are zeroed only if the object is value-initialized.
59
   */
60
  ASTCENC_SIMD_INLINE vfloat4() = default;
61
62
  /**
63
   * @brief Construct from 4 values loaded from an unaligned address.
64
   *
65
   * Consider using loada() which is better with wider VLA vectors if data is
66
   * aligned to vector length.
67
   */
68
  ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
69
53.2M
  {
70
53.2M
    m[0] = p[0];
71
53.2M
    m[1] = p[1];
72
53.2M
    m[2] = p[2];
73
53.2M
    m[3] = p[3];
74
53.2M
  }
75
76
  /**
77
   * @brief Construct from 1 scalar value replicated across all lanes.
78
   *
79
   * Consider using zero() for constexpr zeros.
80
   */
81
  ASTCENC_SIMD_INLINE explicit vfloat4(float a)
82
111M
  {
83
111M
    m[0] = a;
84
111M
    m[1] = a;
85
111M
    m[2] = a;
86
111M
    m[3] = a;
87
111M
  }
88
89
  /**
90
   * @brief Construct from 4 scalar values.
91
   *
92
   * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
93
   */
94
  ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
95
510M
  {
96
510M
    m[0] = a;
97
510M
    m[1] = b;
98
510M
    m[2] = c;
99
510M
    m[3] = d;
100
510M
  }
101
102
  /**
103
   * @brief Get the scalar value of a single lane.
104
   */
105
  template <int l> ASTCENC_SIMD_INLINE float lane() const
106
96.1M
  {
107
96.1M
    return m[l];
108
96.1M
  }
float vfloat4::lane<0>() const
Line
Count
Source
106
30.1M
  {
107
30.1M
    return m[l];
108
30.1M
  }
float vfloat4::lane<3>() const
Line
Count
Source
106
16.6M
  {
107
16.6M
    return m[l];
108
16.6M
  }
float vfloat4::lane<1>() const
Line
Count
Source
106
24.8M
  {
107
24.8M
    return m[l];
108
24.8M
  }
float vfloat4::lane<2>() const
Line
Count
Source
106
24.5M
  {
107
24.5M
    return m[l];
108
24.5M
  }
109
110
  /**
111
   * @brief Set the scalar value of a single lane.
112
   */
113
  template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
114
147k
  {
115
147k
    m[l] = a;
116
147k
  }
void vfloat4::set_lane<3>(float)
Line
Count
Source
114
116k
  {
115
116k
    m[l] = a;
116
116k
  }
void vfloat4::set_lane<0>(float)
Line
Count
Source
114
10.0k
  {
115
10.0k
    m[l] = a;
116
10.0k
  }
void vfloat4::set_lane<1>(float)
Line
Count
Source
114
10.0k
  {
115
10.0k
    m[l] = a;
116
10.0k
  }
void vfloat4::set_lane<2>(float)
Line
Count
Source
114
10.0k
  {
115
10.0k
    m[l] = a;
116
10.0k
  }
117
118
  /**
119
   * @brief Factory that returns a vector of zeros.
120
   */
121
  static ASTCENC_SIMD_INLINE vfloat4 zero()
122
22.2M
  {
123
22.2M
    return vfloat4(0.0f);
124
22.2M
  }
125
126
  /**
127
   * @brief Factory that returns a replicated scalar loaded from memory.
128
   */
129
  static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
130
5.77M
  {
131
5.77M
    return vfloat4(*p);
132
5.77M
  }
133
134
  /**
135
   * @brief Factory that returns a vector loaded from aligned memory.
136
   */
137
  static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
138
52.5M
  {
139
52.5M
    return vfloat4(p);
140
52.5M
  }
141
142
  /**
143
   * @brief Return a swizzled float 2, with the upper two lanes set to zero.
144
   */
145
  template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
146
1.50k
  {
147
1.50k
    return  vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
148
1.50k
  }
vfloat4 vfloat4::swz<0, 1>() const
Line
Count
Source
146
566
  {
147
566
    return  vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
148
566
  }
vfloat4 vfloat4::swz<0, 2>() const
Line
Count
Source
146
472
  {
147
472
    return  vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
148
472
  }
vfloat4 vfloat4::swz<1, 2>() const
Line
Count
Source
146
462
  {
147
462
    return  vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
148
462
  }
149
150
  /**
151
   * @brief Return a swizzled float 3, with the top lane set to zero.
152
   */
153
  template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
154
222k
  {
155
222k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
156
222k
  }
vfloat4 vfloat4::swz<0, 2, 3>() const
Line
Count
Source
154
7.28k
  {
155
7.28k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
156
7.28k
  }
vfloat4 vfloat4::swz<0, 1, 3>() const
Line
Count
Source
154
7.73k
  {
155
7.73k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
156
7.73k
  }
vfloat4 vfloat4::swz<0, 1, 2>() const
Line
Count
Source
154
201k
  {
155
201k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
156
201k
  }
vfloat4 vfloat4::swz<1, 2, 3>() const
Line
Count
Source
154
5.82k
  {
155
5.82k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
156
5.82k
  }
157
158
  /**
159
   * @brief Return a swizzled float 4.
160
   */
161
  template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
162
15.2M
  {
163
15.2M
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
15.2M
  }
vfloat4 vfloat4::swz<0, 0, 0, 0>() const
Line
Count
Source
162
4.08M
  {
163
4.08M
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
4.08M
  }
vfloat4 vfloat4::swz<1, 1, 2, 2>() const
Line
Count
Source
162
57.1k
  {
163
57.1k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
57.1k
  }
vfloat4 vfloat4::swz<1, 1, 1, 1>() const
Line
Count
Source
162
4.02M
  {
163
4.02M
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
4.02M
  }
vfloat4 vfloat4::swz<2, 2, 2, 2>() const
Line
Count
Source
162
4.01M
  {
163
4.01M
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
4.01M
  }
vfloat4 vfloat4::swz<3, 3, 3, 3>() const
Line
Count
Source
162
2.97M
  {
163
2.97M
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
2.97M
  }
vfloat4 vfloat4::swz<2, 2, 2, 3>() const
Line
Count
Source
162
81.5k
  {
163
81.5k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
81.5k
  }
vfloat4 vfloat4::swz<1, 0, 2, 3>() const
Line
Count
Source
162
19.8k
  {
163
19.8k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
19.8k
  }
vfloat4 vfloat4::swz<2, 1, 0, 3>() const
Line
Count
Source
162
27.3k
  {
163
27.3k
    return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
164
27.3k
  }
165
166
  /**
167
   * @brief The vector lane values, with lane 0 stored in m[0].
168
   */
169
  float m[4];
170
};
171
172
// ============================================================================
173
// vint4 data type
174
// ============================================================================
175
176
/**
177
 * @brief Data type for 4-wide ints.
178
 */
179
struct vint4
180
{
181
  /**
182
   * @brief Default constructor; lanes are zeroed only if the object is value-initialized.
183
   */
184
  ASTCENC_SIMD_INLINE vint4() = default;
185
186
  /**
187
   * @brief Construct from 4 values loaded from an unaligned address.
188
   *
189
   * Consider using vint4::loada() which is better with wider VLA vectors
190
   * if data is aligned.
191
   */
192
  ASTCENC_SIMD_INLINE explicit vint4(const int* p)
193
89.5k
  {
194
89.5k
    m[0] = p[0];
195
89.5k
    m[1] = p[1];
196
89.5k
    m[2] = p[2];
197
89.5k
    m[3] = p[3];
198
89.5k
  }
199
200
  /**
201
   * @brief Construct from 4 uint8_t loaded from an unaligned address.
202
   */
203
  ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
204
5.94M
  {
205
5.94M
    m[0] = p[0];
206
5.94M
    m[1] = p[1];
207
5.94M
    m[2] = p[2];
208
5.94M
    m[3] = p[3];
209
5.94M
  }
210
211
  /**
212
   * @brief Construct from 4 scalar values.
213
   *
214
   * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
215
   */
216
  ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
217
50.7M
  {
218
50.7M
    m[0] = a;
219
50.7M
    m[1] = b;
220
50.7M
    m[2] = c;
221
50.7M
    m[3] = d;
222
50.7M
  }
223
224
225
  /**
226
   * @brief Construct from 1 scalar value replicated across all lanes.
227
   *
228
   * Consider using zero() for constexpr zeros.
229
   */
230
  ASTCENC_SIMD_INLINE explicit vint4(int a)
231
27.0M
  {
232
27.0M
    m[0] = a;
233
27.0M
    m[1] = a;
234
27.0M
    m[2] = a;
235
27.0M
    m[3] = a;
236
27.0M
  }
237
238
  /**
239
   * @brief Get the scalar value of a single lane.
240
   */
241
  template <int l> ASTCENC_SIMD_INLINE int lane() const
242
23.8M
  {
243
23.8M
    return m[l];
244
23.8M
  }
int vint4::lane<0>() const
Line
Count
Source
242
7.61M
  {
243
7.61M
    return m[l];
244
7.61M
  }
int vint4::lane<1>() const
Line
Count
Source
242
5.51M
  {
243
5.51M
    return m[l];
244
5.51M
  }
int vint4::lane<2>() const
Line
Count
Source
242
5.52M
  {
243
5.52M
    return m[l];
244
5.52M
  }
int vint4::lane<3>() const
Line
Count
Source
242
5.17M
  {
243
5.17M
    return m[l];
244
5.17M
  }
245
246
  /**
247
   * @brief Set the scalar value of a single lane.
248
   */
249
  template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
250
380k
  {
251
380k
    m[l] = a;
252
380k
  }
253
254
  /**
255
   * @brief Factory that returns a vector of zeros.
256
   */
257
  static ASTCENC_SIMD_INLINE vint4 zero()
258
554k
  {
259
554k
    return vint4(0);
260
554k
  }
261
262
  /**
263
   * @brief Factory that returns a replicated scalar loaded from memory.
264
   */
265
  static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
266
0
  {
267
0
    return vint4(*p);
268
0
  }
269
270
  /**
271
   * @brief Factory that returns a vector loaded from unaligned memory.
272
   */
273
  static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
274
0
  {
275
0
    vint4 data;
276
0
    std::memcpy(&data.m, p, 4 * sizeof(int));
277
0
    return data;
278
0
  }
279
280
  /**
281
   * @brief Factory that returns a vector loaded from 16B aligned memory.
282
   */
283
  static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
284
89.2k
  {
285
89.2k
    return vint4(p);
286
89.2k
  }
287
288
  /**
289
   * @brief Factory that returns a vector containing the lane IDs.
290
   */
291
  static ASTCENC_SIMD_INLINE vint4 lane_id()
292
1.09M
  {
293
1.09M
    return vint4(0, 1, 2, 3);
294
1.09M
  }
295
296
  /**
297
   * @brief The vector lane values, with lane 0 stored in m[0].
298
   */
299
  int m[4];
300
};
301
302
// ============================================================================
303
// vmask4 data type
304
// ============================================================================
305
306
/**
307
 * @brief Data type for 4-wide control plane masks.
308
 */
309
struct vmask4
310
{
311
  /**
312
   * @brief Construct from 4 existing mask lane values loaded from memory.
313
   */
314
  ASTCENC_SIMD_INLINE explicit vmask4(int* p)
315
0
  {
316
0
    m[0] = p[0];
317
0
    m[1] = p[1];
318
0
    m[2] = p[2];
319
0
    m[3] = p[3];
320
0
  }
321
322
  /**
323
   * @brief Construct from 1 scalar value replicated across all lanes.
324
   */
325
  ASTCENC_SIMD_INLINE explicit vmask4(bool a)
326
449k
  {
327
449k
    m[0] = a == false ? 0 : -1;
328
449k
    m[1] = a == false ? 0 : -1;
329
449k
    m[2] = a == false ? 0 : -1;
330
449k
    m[3] = a == false ? 0 : -1;
331
449k
  }
332
333
  /**
334
   * @brief Construct from 4 scalar values.
335
   *
336
   * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
337
   */
338
  ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
339
44.0M
  {
340
44.0M
    m[0] = a == false ? 0 : -1;
341
44.0M
    m[1] = b == false ? 0 : -1;
342
44.0M
    m[2] = c == false ? 0 : -1;
343
44.0M
    m[3] = d == false ? 0 : -1;
344
44.0M
  }
345
346
  /**
347
   * @brief Get the scalar value of a single lane.
348
   */
349
  template <int l> ASTCENC_SIMD_INLINE bool lane() const
350
357k
  {
351
357k
    return m[l] != 0;
352
357k
  }
bool vmask4::lane<0>() const
Line
Count
Source
350
89.2k
  {
351
89.2k
    return m[l] != 0;
352
89.2k
  }
bool vmask4::lane<1>() const
Line
Count
Source
350
89.2k
  {
351
89.2k
    return m[l] != 0;
352
89.2k
  }
bool vmask4::lane<2>() const
Line
Count
Source
350
89.2k
  {
351
89.2k
    return m[l] != 0;
352
89.2k
  }
bool vmask4::lane<3>() const
Line
Count
Source
350
89.2k
  {
351
89.2k
    return m[l] != 0;
352
89.2k
  }
353
354
  /**
355
   * @brief The mask lane values; the bool constructors store 0 (false) or -1 (true).
356
   */
357
  int m[4];
358
};
359
360
// ============================================================================
361
// vmask4 operators and functions
362
// ============================================================================
363
364
/**
365
 * @brief Overload: mask union (or).
366
 */
367
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
368
152k
{
369
152k
  return vmask4(a.m[0] | b.m[0],
370
152k
                a.m[1] | b.m[1],
371
152k
                a.m[2] | b.m[2],
372
152k
                a.m[3] | b.m[3]);
373
152k
}
374
375
/**
376
 * @brief Overload: mask intersect (and).
377
 */
378
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
379
1.89M
{
380
1.89M
  return vmask4(a.m[0] & b.m[0],
381
1.89M
                a.m[1] & b.m[1],
382
1.89M
                a.m[2] & b.m[2],
383
1.89M
                a.m[3] & b.m[3]);
384
1.89M
}
385
386
/**
387
 * @brief Overload: mask difference (xor).
388
 */
389
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
390
0
{
391
0
  return vmask4(a.m[0] ^ b.m[0],
392
0
                a.m[1] ^ b.m[1],
393
0
                a.m[2] ^ b.m[2],
394
0
                a.m[3] ^ b.m[3]);
395
0
}
396
397
/**
398
 * @brief Overload: mask invert (not).
399
 */
400
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
401
89.8k
{
402
89.8k
  return vmask4(~a.m[0],
403
89.8k
                ~a.m[1],
404
89.8k
                ~a.m[2],
405
89.8k
                ~a.m[3]);
406
89.8k
}
407
408
/**
409
 * @brief Return a 4-bit mask code, one bit per lane, indicating mask status.
410
 *
411
 * bit0 = lane 0
412
 */
413
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
414
1.13M
{
415
1.13M
  return (a.m[0] & 0x1) |
416
1.13M
         (a.m[1] & 0x2) |
417
1.13M
         (a.m[2] & 0x4) |
418
1.13M
         (a.m[3] & 0x8);
419
1.13M
}
420
421
/**
422
 * @brief True if any lanes are enabled, false otherwise.
423
 */
424
ASTCENC_SIMD_INLINE bool any(vmask4 a)
425
396k
{
426
396k
  return mask(a) != 0;
427
396k
}
428
429
/**
430
 * @brief True if all lanes are enabled, false otherwise.
431
 */
432
ASTCENC_SIMD_INLINE bool all(vmask4 a)
433
705k
{
434
705k
  return mask(a) == 0xF;
435
705k
}
436
437
// ============================================================================
438
// vint4 operators and functions
439
// ============================================================================
440
441
/**
442
 * @brief Overload: vector by vector addition.
443
 */
444
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
445
11.0M
{
446
11.0M
  return vint4(a.m[0] + b.m[0],
447
11.0M
               a.m[1] + b.m[1],
448
11.0M
               a.m[2] + b.m[2],
449
11.0M
               a.m[3] + b.m[3]);
450
11.0M
}
451
452
/**
453
 * @brief Overload: vector by vector subtraction.
454
 */
455
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
456
2.53M
{
457
2.53M
  return vint4(a.m[0] - b.m[0],
458
2.53M
               a.m[1] - b.m[1],
459
2.53M
               a.m[2] - b.m[2],
460
2.53M
               a.m[3] - b.m[3]);
461
2.53M
}
462
463
/**
464
 * @brief Overload: vector by vector multiplication.
465
 */
466
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
467
8.45M
{
468
8.45M
  return vint4(a.m[0] * b.m[0],
469
8.45M
               a.m[1] * b.m[1],
470
8.45M
               a.m[2] * b.m[2],
471
8.45M
               a.m[3] * b.m[3]);
472
8.45M
}
473
474
/**
475
 * @brief Overload: vector bit invert.
476
 */
477
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
478
186k
{
479
186k
  return vint4(~a.m[0],
480
186k
               ~a.m[1],
481
186k
               ~a.m[2],
482
186k
               ~a.m[3]);
483
186k
}
484
485
/**
486
 * @brief Overload: vector by vector bitwise or.
487
 */
488
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
489
434k
{
490
434k
  return vint4(a.m[0] | b.m[0],
491
434k
               a.m[1] | b.m[1],
492
434k
               a.m[2] | b.m[2],
493
434k
               a.m[3] | b.m[3]);
494
434k
}
495
496
/**
497
 * @brief Overload: vector by vector bitwise and.
498
 */
499
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
500
1.22M
{
501
1.22M
  return vint4(a.m[0] & b.m[0],
502
1.22M
               a.m[1] & b.m[1],
503
1.22M
               a.m[2] & b.m[2],
504
1.22M
               a.m[3] & b.m[3]);
505
1.22M
}
506
507
/**
508
 * @brief Overload: vector by vector bitwise xor.
509
 */
510
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
511
657k
{
512
657k
  return vint4(a.m[0] ^ b.m[0],
513
657k
               a.m[1] ^ b.m[1],
514
657k
               a.m[2] ^ b.m[2],
515
657k
               a.m[3] ^ b.m[3]);
516
657k
}
517
518
/**
519
 * @brief Overload: vector by vector equality.
520
 */
521
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
522
1.77M
{
523
1.77M
  return vmask4(a.m[0] == b.m[0],
524
1.77M
                a.m[1] == b.m[1],
525
1.77M
                a.m[2] == b.m[2],
526
1.77M
                a.m[3] == b.m[3]);
527
1.77M
}
528
529
/**
530
 * @brief Overload: vector by vector inequality.
531
 */
532
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
533
45.9k
{
534
45.9k
  return vmask4(a.m[0] != b.m[0],
535
45.9k
                a.m[1] != b.m[1],
536
45.9k
                a.m[2] != b.m[2],
537
45.9k
                a.m[3] != b.m[3]);
538
45.9k
}
539
540
/**
541
 * @brief Overload: vector by vector less than.
542
 */
543
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
544
3.40M
{
545
3.40M
  return vmask4(a.m[0] < b.m[0],
546
3.40M
                a.m[1] < b.m[1],
547
3.40M
                a.m[2] < b.m[2],
548
3.40M
                a.m[3] < b.m[3]);
549
3.40M
}
550
551
/**
552
 * @brief Overload: vector by vector greater than.
553
 */
554
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
555
29.7k
{
556
29.7k
  return vmask4(a.m[0] > b.m[0],
557
29.7k
                a.m[1] > b.m[1],
558
29.7k
                a.m[2] > b.m[2],
559
29.7k
                a.m[3] > b.m[3]);
560
29.7k
}
561
562
/**
563
 * @brief Logical shift left.
564
 */
565
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
566
1.03M
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
1.03M
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
1.03M
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
1.03M
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
1.03M
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
1.03M
  return vint4(static_cast<int>(as0),
574
1.03M
               static_cast<int>(as1),
575
1.03M
               static_cast<int>(as2),
576
1.03M
               static_cast<int>(as3));
577
1.03M
}
vint4 lsl<23>(vint4)
Line
Count
Source
566
188k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
188k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
188k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
188k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
188k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
188k
  return vint4(static_cast<int>(as0),
574
188k
               static_cast<int>(as1),
575
188k
               static_cast<int>(as2),
576
188k
               static_cast<int>(as3));
577
188k
}
vint4 lsl<10>(vint4)
Line
Count
Source
566
220k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
220k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
220k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
220k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
220k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
220k
  return vint4(static_cast<int>(as0),
574
220k
               static_cast<int>(as1),
575
220k
               static_cast<int>(as2),
576
220k
               static_cast<int>(as3));
577
220k
}
vint4 lsl<8>(vint4)
Line
Count
Source
566
410k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
410k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
410k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
410k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
410k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
410k
  return vint4(static_cast<int>(as0),
574
410k
               static_cast<int>(as1),
575
410k
               static_cast<int>(as2),
576
410k
               static_cast<int>(as3));
577
410k
}
vint4 lsl<16>(vint4)
Line
Count
Source
566
81.2k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
81.2k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
81.2k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
81.2k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
81.2k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
81.2k
  return vint4(static_cast<int>(as0),
574
81.2k
               static_cast<int>(as1),
575
81.2k
               static_cast<int>(as2),
576
81.2k
               static_cast<int>(as3));
577
81.2k
}
vint4 lsl<24>(vint4)
Line
Count
Source
566
81.2k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
81.2k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
81.2k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
81.2k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
81.2k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
81.2k
  return vint4(static_cast<int>(as0),
574
81.2k
               static_cast<int>(as1),
575
81.2k
               static_cast<int>(as2),
576
81.2k
               static_cast<int>(as3));
577
81.2k
}
vint4 lsl<1>(vint4)
Line
Count
Source
566
50.3k
{
567
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
568
50.3k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
569
50.3k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
570
50.3k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
571
50.3k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
572
573
50.3k
  return vint4(static_cast<int>(as0),
574
50.3k
               static_cast<int>(as1),
575
50.3k
               static_cast<int>(as2),
576
50.3k
               static_cast<int>(as3));
577
50.3k
}
578
579
/**
580
 * @brief Logical shift right.
581
 */
582
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
583
1.50M
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
1.50M
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
1.50M
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
1.50M
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
1.50M
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
1.50M
  return vint4(static_cast<int>(as0),
591
1.50M
               static_cast<int>(as1),
592
1.50M
               static_cast<int>(as2),
593
1.50M
               static_cast<int>(as3));
594
1.50M
}
vint4 lsr<23>(vint4)
Line
Count
Source
583
214k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
214k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
214k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
214k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
214k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
214k
  return vint4(static_cast<int>(as0),
591
214k
               static_cast<int>(as1),
592
214k
               static_cast<int>(as2),
593
214k
               static_cast<int>(as3));
594
214k
}
vint4 lsr<11>(vint4)
Line
Count
Source
583
34.4k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
34.4k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
34.4k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
34.4k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
34.4k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
34.4k
  return vint4(static_cast<int>(as0),
591
34.4k
               static_cast<int>(as1),
592
34.4k
               static_cast<int>(as2),
593
34.4k
               static_cast<int>(as3));
594
34.4k
}
vint4 lsr<3>(vint4)
Line
Count
Source
583
34.4k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
34.4k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
34.4k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
34.4k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
34.4k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
34.4k
  return vint4(static_cast<int>(as0),
591
34.4k
               static_cast<int>(as1),
592
34.4k
               static_cast<int>(as2),
593
34.4k
               static_cast<int>(as3));
594
34.4k
}
vint4 lsr<8>(vint4)
Line
Count
Source
583
186k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
186k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
186k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
186k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
186k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
186k
  return vint4(static_cast<int>(as0),
591
186k
               static_cast<int>(as1),
592
186k
               static_cast<int>(as2),
593
186k
               static_cast<int>(as3));
594
186k
}
vint4 lsr<6>(vint4)
Line
Count
Source
583
186k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
186k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
186k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
186k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
186k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
186k
  return vint4(static_cast<int>(as0),
591
186k
               static_cast<int>(as1),
592
186k
               static_cast<int>(as2),
593
186k
               static_cast<int>(as3));
594
186k
}
vint4 lsr<1>(vint4)
Line
Count
Source
583
31.7k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
31.7k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
31.7k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
31.7k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
31.7k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
31.7k
  return vint4(static_cast<int>(as0),
591
31.7k
               static_cast<int>(as1),
592
31.7k
               static_cast<int>(as2),
593
31.7k
               static_cast<int>(as3));
594
31.7k
}
vint4 lsr<4>(vint4)
Line
Count
Source
583
818k
{
584
  // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
585
818k
  unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
586
818k
  unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
587
818k
  unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
588
818k
  unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
589
590
818k
  return vint4(static_cast<int>(as0),
591
818k
               static_cast<int>(as1),
592
818k
               static_cast<int>(as2),
593
818k
               static_cast<int>(as3));
594
818k
}
595
596
/**
597
 * @brief Arithmetic shift right.
598
 */
599
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
600
4.08M
{
601
4.08M
  return vint4(a.m[0] >> s,
602
4.08M
               a.m[1] >> s,
603
4.08M
               a.m[2] >> s,
604
4.08M
               a.m[3] >> s);
605
4.08M
}
vint4 asr<1>(vint4)
Line
Count
Source
600
22.6k
{
601
22.6k
  return vint4(a.m[0] >> s,
602
22.6k
               a.m[1] >> s,
603
22.6k
               a.m[2] >> s,
604
22.6k
               a.m[3] >> s);
605
22.6k
}
vint4 asr<8>(vint4)
Line
Count
Source
600
2.05M
{
601
2.05M
  return vint4(a.m[0] >> s,
602
2.05M
               a.m[1] >> s,
603
2.05M
               a.m[2] >> s,
604
2.05M
               a.m[3] >> s);
605
2.05M
}
vint4 asr<6>(vint4)
Line
Count
Source
600
2.01M
{
601
2.01M
  return vint4(a.m[0] >> s,
602
2.01M
               a.m[1] >> s,
603
2.01M
               a.m[2] >> s,
604
2.01M
               a.m[3] >> s);
605
2.01M
}
606
607
/**
608
 * @brief Return the min vector of two vectors.
609
 */
610
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
611
2.14M
{
612
2.14M
  return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
613
2.14M
               a.m[1] < b.m[1] ? a.m[1] : b.m[1],
614
2.14M
               a.m[2] < b.m[2] ? a.m[2] : b.m[2],
615
2.14M
               a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
616
2.14M
}
617
618
/**
619
 * @brief Return the max vector of two vectors.
620
 */
621
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
622
568k
{
623
568k
  return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
624
568k
               a.m[1] > b.m[1] ? a.m[1] : b.m[1],
625
568k
               a.m[2] > b.m[2] ? a.m[2] : b.m[2],
626
568k
               a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
627
568k
}
628
629
/**
630
 * @brief Return the horizontal minimum of a single vector.
631
 */
632
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
633
33.5k
{
634
33.5k
  int b = std::min(a.m[0], a.m[1]);
635
33.5k
  int c = std::min(a.m[2], a.m[3]);
636
33.5k
  return vint4(std::min(b, c));
637
33.5k
}
638
639
/**
640
 * @brief Return the horizontal maximum of a single vector.
641
 */
642
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
643
2.06M
{
644
2.06M
  int b = std::max(a.m[0], a.m[1]);
645
2.06M
  int c = std::max(a.m[2], a.m[3]);
646
2.06M
  return vint4(std::max(b, c));
647
2.06M
}
648
649
/**
650
 * @brief Store a vector to an aligned memory address.
651
 */
652
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
653
1.15M
{
654
1.15M
  p[0] = a.m[0];
655
1.15M
  p[1] = a.m[1];
656
1.15M
  p[2] = a.m[2];
657
1.15M
  p[3] = a.m[3];
658
1.15M
}
659
660
/**
661
 * @brief Store a vector to an unaligned memory address.
662
 */
663
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
664
818k
{
665
818k
  p[0] = a.m[0];
666
818k
  p[1] = a.m[1];
667
818k
  p[2] = a.m[2];
668
818k
  p[3] = a.m[3];
669
818k
}
670
671
/**
672
 * @brief Store a vector to an unaligned memory address.
673
 */
674
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
675
67.8k
{
676
67.8k
  std::memcpy(p, a.m, sizeof(int) * 4);
677
67.8k
}
678
679
/**
680
 * @brief Store lowest N (vector width) bytes into an unaligned address.
681
 */
682
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
683
1.61M
{
684
1.61M
  std::memcpy(p, a.m, sizeof(uint8_t) * 4);
685
1.61M
}
686
687
/**
688
 * @brief Pack low 8 bits of N (vector width) lanes and store them to an unaligned address.
689
 */
690
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
691
1.54M
{
692
1.54M
  int b0 = a.m[0] & 0xFF;
693
1.54M
  int b1 = a.m[1] & 0xFF;
694
1.54M
  int b2 = a.m[2] & 0xFF;
695
1.54M
  int b3 = a.m[3] & 0xFF;
696
697
1.54M
#if !defined(ASTCENC_BIG_ENDIAN)
698
1.54M
  int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
699
#else
700
  int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
701
#endif
702
1.54M
  a = vint4(b, 0, 0, 0);
703
1.54M
  store_nbytes(a, p);
704
1.54M
}
705
706
/**
707
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
708
 */
709
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
710
5.60M
{
711
5.60M
  return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
712
5.60M
               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
713
5.60M
               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
714
5.60M
               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
715
5.60M
}
716
717
// ============================================================================
718
// vfloat4 operators and functions
719
// ============================================================================
720
721
/**
722
 * @brief Overload: vector by vector addition.
723
 */
724
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
725
139M
{
726
139M
  return vfloat4(a.m[0] + b.m[0],
727
139M
                 a.m[1] + b.m[1],
728
139M
                 a.m[2] + b.m[2],
729
139M
                 a.m[3] + b.m[3]);
730
139M
}
731
732
/**
733
 * @brief Overload: vector by vector subtraction.
734
 */
735
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
736
51.1M
{
737
51.1M
  return vfloat4(a.m[0] - b.m[0],
738
51.1M
                 a.m[1] - b.m[1],
739
51.1M
                 a.m[2] - b.m[2],
740
51.1M
                 a.m[3] - b.m[3]);
741
51.1M
}
742
743
/**
744
 * @brief Overload: vector by vector multiplication.
745
 */
746
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
747
158M
{
748
158M
  return vfloat4(a.m[0] * b.m[0],
749
158M
                 a.m[1] * b.m[1],
750
158M
                 a.m[2] * b.m[2],
751
158M
                 a.m[3] * b.m[3]);
752
158M
}
753
754
/**
755
 * @brief Overload: vector by vector division.
756
 */
757
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
758
4.21M
{
759
4.21M
  return vfloat4(a.m[0] / b.m[0],
760
4.21M
                 a.m[1] / b.m[1],
761
4.21M
                 a.m[2] / b.m[2],
762
4.21M
                 a.m[3] / b.m[3]);
763
4.21M
}
764
765
/**
766
 * @brief Overload: vector by vector equality.
767
 *
768
 * Returns vector of false mask values if a or b is NaN.
769
 */
770
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
771
12.5M
{
772
12.5M
  return vmask4(a.m[0] == b.m[0],
773
12.5M
                a.m[1] == b.m[1],
774
12.5M
                a.m[2] == b.m[2],
775
12.5M
                a.m[3] == b.m[3]);
776
12.5M
}
777
778
/**
779
 * @brief Overload: vector by vector inequality.
780
 *
781
 * Returns vector of true mask values if a or b is NaN.
782
 */
783
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
784
81.2k
{
785
81.2k
  return vmask4(a.m[0] != b.m[0],
786
81.2k
                a.m[1] != b.m[1],
787
81.2k
                a.m[2] != b.m[2],
788
81.2k
                a.m[3] != b.m[3]);
789
81.2k
}
790
791
/**
792
 * @brief Overload: vector by vector less than.
793
 */
794
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
795
2.23M
{
796
2.23M
  return vmask4(a.m[0] < b.m[0],
797
2.23M
                a.m[1] < b.m[1],
798
2.23M
                a.m[2] < b.m[2],
799
2.23M
                a.m[3] < b.m[3]);
800
2.23M
}
801
802
/**
803
 * @brief Overload: vector by vector greater than.
804
 */
805
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
806
21.4M
{
807
21.4M
  return vmask4(a.m[0] > b.m[0],
808
21.4M
                a.m[1] > b.m[1],
809
21.4M
                a.m[2] > b.m[2],
810
21.4M
                a.m[3] > b.m[3]);
811
21.4M
}
812
813
/**
814
 * @brief Overload: vector by vector less than or equal.
815
 */
816
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
817
25.9k
{
818
25.9k
  return vmask4(a.m[0] <= b.m[0],
819
25.9k
                a.m[1] <= b.m[1],
820
25.9k
                a.m[2] <= b.m[2],
821
25.9k
                a.m[3] <= b.m[3]);
822
25.9k
}
823
824
/**
825
 * @brief Overload: vector by vector greater than or equal.
826
 */
827
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
828
118k
{
829
118k
  return vmask4(a.m[0] >= b.m[0],
830
118k
                a.m[1] >= b.m[1],
831
118k
                a.m[2] >= b.m[2],
832
118k
                a.m[3] >= b.m[3]);
833
118k
}
834
835
/**
836
 * @brief Return the min vector of two vectors.
837
 *
838
 * If either lane value is NaN, @c b will be returned for that lane.
839
 */
840
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
841
8.86M
{
842
8.86M
  return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
843
8.86M
                 a.m[1] < b.m[1] ? a.m[1] : b.m[1],
844
8.86M
                 a.m[2] < b.m[2] ? a.m[2] : b.m[2],
845
8.86M
                 a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
846
8.86M
}
847
848
/**
849
 * @brief Return the max vector of two vectors.
850
 *
851
 * If either lane value is NaN, @c b will be returned for that lane.
852
 */
853
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
854
5.77M
{
855
5.77M
  return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
856
5.77M
                 a.m[1] > b.m[1] ? a.m[1] : b.m[1],
857
5.77M
                 a.m[2] > b.m[2] ? a.m[2] : b.m[2],
858
5.77M
                 a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
859
5.77M
}
860
861
/**
862
 * @brief Return the absolute value of the float vector.
863
 */
864
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
865
2.62M
{
866
2.62M
  return vfloat4(std::abs(a.m[0]),
867
2.62M
                 std::abs(a.m[1]),
868
2.62M
                 std::abs(a.m[2]),
869
2.62M
                 std::abs(a.m[3]));
870
2.62M
}
871
872
/**
873
 * @brief Return a float rounded to the nearest integer value.
874
 */
875
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
876
6.42M
{
877
6.42M
  assert(std::fegetround() == FE_TONEAREST);
878
6.42M
  return vfloat4(std::nearbyint(a.m[0]),
879
6.42M
                 std::nearbyint(a.m[1]),
880
6.42M
                 std::nearbyint(a.m[2]),
881
6.42M
                 std::nearbyint(a.m[3]));
882
6.42M
}
883
884
/**
885
 * @brief Return the horizontal minimum of a vector.
886
 */
887
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
888
552k
{
889
552k
  float tmp1 = std::min(a.m[0], a.m[1]);
890
552k
  float tmp2 = std::min(a.m[2], a.m[3]);
891
552k
  return vfloat4(std::min(tmp1, tmp2));
892
552k
}
893
894
/**
895
 * @brief Return the horizontal maximum of a vector.
896
 */
897
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
898
476k
{
899
476k
  float tmp1 = std::max(a.m[0], a.m[1]);
900
476k
  float tmp2 = std::max(a.m[2], a.m[3]);
901
476k
  return vfloat4(std::max(tmp1, tmp2));
902
476k
}
903
904
/**
905
 * @brief Return the horizontal sum of a vector.
906
 */
907
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
908
11.3M
{
909
  // Use halving add, gives invariance with SIMD versions
910
11.3M
  return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
911
11.3M
}
912
913
/**
914
 * @brief Return the sqrt of the lanes in the vector.
915
 */
916
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
917
783k
{
918
783k
  return vfloat4(std::sqrt(a.m[0]),
919
783k
                 std::sqrt(a.m[1]),
920
783k
                 std::sqrt(a.m[2]),
921
783k
                 std::sqrt(a.m[3]));
922
783k
}
923
924
/**
925
 * @brief Return lanes from @c b if @c cond is set, else @c a.
926
 */
927
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
928
46.9M
{
929
46.9M
  return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
930
46.9M
                 (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
931
46.9M
                 (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
932
46.9M
                 (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
933
46.9M
}
934
935
/**
936
 * @brief Load a vector of gathered results from an array;
937
 */
938
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
939
0
{
940
0
  return vfloat4(base[indices.m[0]],
941
0
                 base[indices.m[1]],
942
0
                 base[indices.m[2]],
943
0
                 base[indices.m[3]]);
944
0
}
945
946
/**
947
 * @brief Load a vector of gathered results from an array using byte indices from memory
948
 */
949
template<>
950
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
951
41.1M
{
952
41.1M
  return vfloat4(base[indices[0]],
953
41.1M
                 base[indices[1]],
954
41.1M
                 base[indices[2]],
955
41.1M
                 base[indices[3]]);
956
41.1M
}
957
958
/**
959
 * @brief Store a vector to an unaligned memory address.
960
 */
961
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
962
0
{
963
0
  ptr[0] = a.m[0];
964
0
  ptr[1] = a.m[1];
965
0
  ptr[2] = a.m[2];
966
0
  ptr[3] = a.m[3];
967
0
}
968
969
/**
970
 * @brief Store a vector to an aligned memory address.
971
 */
972
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
973
7.50M
{
974
7.50M
  ptr[0] = a.m[0];
975
7.50M
  ptr[1] = a.m[1];
976
7.50M
  ptr[2] = a.m[2];
977
7.50M
  ptr[3] = a.m[3];
978
7.50M
}
979
980
/**
981
 * @brief Return a integer value for a float vector, using truncation.
982
 */
983
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
984
2.05M
{
985
2.05M
  return vint4(static_cast<int>(a.m[0]),
986
2.05M
               static_cast<int>(a.m[1]),
987
2.05M
               static_cast<int>(a.m[2]),
988
2.05M
               static_cast<int>(a.m[3]));
989
2.05M
}
990
991
/**f
992
 * @brief Return a integer value for a float vector, using round-to-nearest.
993
 */
994
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
995
1.29M
{
996
1.29M
  a = a + vfloat4(0.5f);
997
1.29M
  return vint4(static_cast<int>(a.m[0]),
998
1.29M
               static_cast<int>(a.m[1]),
999
1.29M
               static_cast<int>(a.m[2]),
1000
1.29M
               static_cast<int>(a.m[3]));
1001
1.29M
}
1002
1003
/**
1004
 * @brief Return a float value for a integer vector.
1005
 */
1006
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
1007
6.22M
{
1008
6.22M
  return vfloat4(static_cast<float>(a.m[0]),
1009
6.22M
                 static_cast<float>(a.m[1]),
1010
6.22M
                 static_cast<float>(a.m[2]),
1011
6.22M
                 static_cast<float>(a.m[3]));
1012
6.22M
}
1013
1014
/**
1015
 * @brief Return a float16 value for a float vector, using round-to-nearest.
1016
 */
1017
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
1018
5
{
1019
5
  return vint4(
1020
5
    float_to_sf16(a.lane<0>()),
1021
5
    float_to_sf16(a.lane<1>()),
1022
5
    float_to_sf16(a.lane<2>()),
1023
5
    float_to_sf16(a.lane<3>()));
1024
5
}
1025
1026
/**
1027
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
1028
 */
1029
static inline uint16_t float_to_float16(float a)
1030
0
{
1031
0
  return float_to_sf16(a);
1032
0
}
Unexecuted instantiation: astcenc_entry.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_image.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_mathlib_softfloat.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_percentile_tables.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_symbolic_physical.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_weight_align.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_weight_quant_xfer_tables.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_block_sizes.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_color_unquantize.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_compress_symbolic.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_compute_variance.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_decompress_symbolic.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_find_best_partitioning.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_ideal_endpoints_and_weights.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_integer_sequence.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_mathlib.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_partition_tables.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_pick_best_endpoint_format.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_quantization.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_averages_and_directions.cpp:float_to_float16(float)
Unexecuted instantiation: astcenc_color_quantize.cpp:float_to_float16(float)
Unexecuted instantiation: fuzz_astc_physical_to_symbolic.cpp:float_to_float16(float)
1033
1034
/**
1035
 * @brief Return a float value for a float16 vector.
1036
 */
1037
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
1038
208k
{
1039
208k
  return vfloat4(
1040
208k
    sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
1041
208k
    sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
1042
208k
    sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
1043
208k
    sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
1044
208k
}
1045
1046
/**
1047
 * @brief Return a float value for a float16 scalar.
1048
 */
1049
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
1050
0
{
1051
0
  return sf16_to_float(a);
1052
0
}
1053
1054
/**
1055
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
1056
 *
1057
 * It is a common trick to convert floats into integer bit patterns, perform
1058
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
1059
 * convert them back again. This is the first half of that flip.
1060
 */
1061
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
1062
1.51M
{
1063
1.51M
  static_assert(sizeof(int) == sizeof(float), "int must be 32-bit");
1064
1065
1.51M
  vint4 r;
1066
1.51M
  std::memcpy(r.m, a.m, 4 * sizeof(int));
1067
1.51M
  return r;
1068
1.51M
}
1069
1070
/**
1071
 * @brief Return a integer value as a float bit pattern (i.e. no conversion).
1072
 *
1073
 * It is a common trick to convert floats into integer bit patterns, perform
1074
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
1075
 * convert them back again. This is the second half of that flip.
1076
 */
1077
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
1078
867k
{
1079
867k
  vfloat4 r;
1080
867k
  std::memcpy(r.m, a.m, 4 * sizeof(float));
1081
867k
  return r;
1082
867k
}
1083
1084
/*
 * Table structure for a 16x 8-bit entry table.
 *
 * Holds a non-owning pointer to the caller's table data.
 */
struct vtable4_16x8
{
  // Pointer to the table entries; set by vtable_prepare()
  const uint8_t* data;
};
1090
1091
/*
 * Table structure for a 32x 8-bit entry table.
 *
 * Holds a non-owning pointer to the caller's table data.
 */
struct vtable4_32x8
{
  // Pointer to the table entries; set by vtable_prepare()
  const uint8_t* data;
};
1097
1098
/*
 * Table structure for a 64x 8-bit entry table.
 *
 * Holds a non-owning pointer to the caller's table data.
 */
struct vtable4_64x8
{
  // Pointer to the table entries; set by vtable_prepare()
  const uint8_t* data;
};
1104
1105
/**
1106
 * @brief Prepare a vtable lookup table for 16x 8-bit entry table.
1107
 */
1108
ASTCENC_SIMD_INLINE void vtable_prepare(
1109
  vtable4_16x8& table,
1110
  const uint8_t* data
1111
259k
) {
1112
259k
  table.data = data;
1113
259k
}
1114
1115
/**
1116
 * @brief Prepare a vtable lookup table for 32x 8-bit entry table.
1117
 */
1118
ASTCENC_SIMD_INLINE void vtable_prepare(
1119
  vtable4_32x8& table,
1120
  const uint8_t* data
1121
86.3k
) {
1122
86.3k
  table.data = data;
1123
86.3k
}
1124
1125
/**
1126
 * @brief Prepare a vtable lookup table 64x 8-bit entry table.
1127
 */
1128
ASTCENC_SIMD_INLINE void vtable_prepare(
1129
  vtable4_64x8& table,
1130
  const uint8_t* data
1131
59.6k
) {
1132
59.6k
  table.data = data;
1133
59.6k
}
1134
1135
/**
1136
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
1137
 */
1138
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1139
  const vtable4_16x8& table,
1140
  vint4 idx
1141
2.89M
) {
1142
2.89M
  return vint4(table.data[idx.lane<0>()],
1143
2.89M
               table.data[idx.lane<1>()],
1144
2.89M
               table.data[idx.lane<2>()],
1145
2.89M
               table.data[idx.lane<3>()]);
1146
2.89M
}
1147
1148
/**
1149
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
1150
 */
1151
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1152
  const vtable4_32x8& table,
1153
  vint4 idx
1154
1.10M
) {
1155
1.10M
  return vint4(table.data[idx.lane<0>()],
1156
1.10M
               table.data[idx.lane<1>()],
1157
1.10M
               table.data[idx.lane<2>()],
1158
1.10M
               table.data[idx.lane<3>()]);
1159
1.10M
}
1160
1161
/**
1162
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
1163
 */
1164
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
1165
  const vtable4_64x8& table,
1166
  vint4 idx
1167
719k
) {
1168
719k
  return vint4(table.data[idx.lane<0>()],
1169
719k
               table.data[idx.lane<1>()],
1170
719k
               table.data[idx.lane<2>()],
1171
719k
               table.data[idx.lane<3>()]);
1172
719k
}
1173
1174
/**
1175
 * @brief Return a vector of interleaved RGBA data.
1176
 *
1177
 * Input vectors have the value stored in the bottom 8 bits of each lane,
1178
 * with high  bits set to zero.
1179
 *
1180
 * Output vector stores a single RGBA texel packed in each lane.
1181
 */
1182
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
1183
81.2k
{
1184
81.2k
#if !defined(ASTCENC_BIG_ENDIAN)
1185
81.2k
  return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
1186
#else
1187
  return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
1188
#endif
1189
81.2k
}
1190
1191
/**
1192
 * @brief Store a single vector lane to an unaligned address.
1193
 */
1194
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
1195
25.4k
{
1196
25.4k
  std::memcpy(base, &data, sizeof(int));
1197
25.4k
}
1198
1199
/**
1200
 * @brief Store a vector, skipping masked lanes.
1201
 *
1202
 * All masked lanes must be at the end of vector, after all non-masked lanes.
1203
 * Input is a byte array of at least 4 bytes per unmasked entry.
1204
 */
1205
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
1206
81.2k
{
1207
81.2k
  if (mask.m[3])
1208
67.8k
  {
1209
67.8k
    store(data, base);
1210
67.8k
  }
1211
13.3k
  else if (mask.m[2])
1212
0
  {
1213
0
    store_lane(base + 0, data.lane<0>());
1214
0
    store_lane(base + 4, data.lane<1>());
1215
0
    store_lane(base + 8, data.lane<2>());
1216
0
  }
1217
13.3k
  else if (mask.m[1])
1218
12.0k
  {
1219
12.0k
    store_lane(base + 0, data.lane<0>());
1220
12.0k
    store_lane(base + 4, data.lane<1>());
1221
12.0k
  }
1222
1.31k
  else if (mask.m[0])
1223
1.31k
  {
1224
1.31k
    store_lane(base + 0, data.lane<0>());
1225
1.31k
  }
1226
81.2k
}
1227
1228
#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED