Coverage Report

Created: 2026-05-30 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp
Line
Count
Source
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2024 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
 * @brief Functions for computing color endpoints and texel weights.
22
 */
23
24
#include <cassert>
25
26
#include "astcenc_internal.h"
27
#include "astcenc_vecmathlib.h"
28
29
/**
30
 * @brief Compute the infilled weight for N texel indices in a decimated grid.
31
 *
32
 * @param di        The weight grid decimation to use.
33
 * @param weights   The decimated weight values to use.
34
 * @param index     The first texel index to interpolate.
35
 *
36
 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
37
 */
38
static vfloat bilinear_infill_vla(
39
  const decimation_info& di,
40
  const float* weights,
41
  unsigned int index
42
2.88M
) {
43
  // Load the bilinear filter texel weight indexes in the decimated grid
44
2.88M
  const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
45
2.88M
  const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
46
2.88M
  const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
47
2.88M
  const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
48
49
  // Load the bilinear filter weights from the decimated grid
50
2.88M
  vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
51
2.88M
  vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
52
2.88M
  vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
53
2.88M
  vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
54
55
  // Load the weight contribution factors for each decimated weight
56
2.88M
  vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
57
2.88M
  vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
58
2.88M
  vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
59
2.88M
  vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
60
61
  // Compute the bilinear interpolation to generate the per-texel weight
62
2.88M
  return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
63
2.88M
         (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
64
2.88M
}
65
66
/**
67
 * @brief Compute the infilled weight for N texel indices in a decimated grid.
68
 *
69
 * This is specialized version which computes only two weights per texel for
70
 * encodings that are only decimated in a single axis.
71
 *
72
 * @param di        The weight grid decimation to use.
73
 * @param weights   The decimated weight values to use.
74
 * @param index     The first texel index to interpolate.
75
 *
76
 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
77
 */
78
static vfloat bilinear_infill_vla_2(
79
  const decimation_info& di,
80
  const float* weights,
81
  unsigned int index
82
1.47M
) {
83
  // Load the bilinear filter texel weight indexes in the decimated grid
84
1.47M
  const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
85
1.47M
  const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
86
87
  // Load the bilinear filter weights from the decimated grid
88
1.47M
  vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
89
1.47M
  vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
90
91
  // Load the weight contribution factors for each decimated weight
92
1.47M
  vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
93
1.47M
  vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
94
95
  // Compute the bilinear interpolation to generate the per-texel weight
96
1.47M
  return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
97
1.47M
}
98
99
/**
100
 * @brief Compute the ideal endpoints and weights for 1 color component.
101
 *
102
 * @param      blk         The image block color data to compress.
103
 * @param      pi          The partition info for the current trial.
104
 * @param[out] ei          The computed ideal endpoints and weights.
105
 * @param      component   The color component to compute.
106
 */
107
static void compute_ideal_colors_and_weights_1_comp(
108
  const image_block& blk,
109
  const partition_info& pi,
110
  endpoints_and_weights& ei,
111
  unsigned int component
112
6.99k
) {
113
6.99k
  unsigned int partition_count = pi.partition_count;
114
6.99k
  ei.ep.partition_count = partition_count;
115
6.99k
  promise(partition_count > 0);
116
117
6.99k
  unsigned int texel_count = blk.texel_count;
118
6.99k
  promise(texel_count > 0);
119
120
6.99k
  float error_weight;
121
6.99k
  const float* data_vr = nullptr;
122
123
6.99k
  assert(component < BLOCK_MAX_COMPONENTS);
124
6.99k
  switch (component)
125
6.99k
  {
126
1.66k
  case 0:
127
1.66k
    error_weight = blk.channel_weight.lane<0>();
128
1.66k
    data_vr = blk.data_r;
129
1.66k
    break;
130
1.69k
  case 1:
131
1.69k
    error_weight = blk.channel_weight.lane<1>();
132
1.69k
    data_vr = blk.data_g;
133
1.69k
    break;
134
1.83k
  case 2:
135
1.83k
    error_weight = blk.channel_weight.lane<2>();
136
1.83k
    data_vr = blk.data_b;
137
1.83k
    break;
138
1.79k
  default:
139
1.79k
    assert(component == 3);
140
1.79k
    error_weight = blk.channel_weight.lane<3>();
141
1.79k
    data_vr = blk.data_a;
142
1.79k
    break;
143
6.99k
  }
144
145
6.99k
  vmask4 sep_mask = vint4::lane_id() == vint4(component);
146
6.99k
  bool is_constant_wes { true };
147
6.99k
  float partition0_len_sq { 0.0f };
148
149
13.9k
  for (unsigned int i = 0; i < partition_count; i++)
150
6.99k
  {
151
6.99k
    float lowvalue { 1e10f };
152
6.99k
    float highvalue { -1e10f };
153
154
6.99k
    unsigned int partition_texel_count = pi.partition_texel_count[i];
155
181k
    for (unsigned int j = 0; j < partition_texel_count; j++)
156
174k
    {
157
174k
      unsigned int tix = pi.texels_of_partition[i][j];
158
174k
      float value = data_vr[tix];
159
174k
      lowvalue = astc::min(value, lowvalue);
160
174k
      highvalue = astc::max(value, highvalue);
161
174k
    }
162
163
6.99k
    if (highvalue <= lowvalue)
164
0
    {
165
0
      lowvalue = 0.0f;
166
0
      highvalue = 1e-7f;
167
0
    }
168
169
6.99k
    float length = highvalue - lowvalue;
170
6.99k
    float length_squared = length * length;
171
6.99k
    float scale = 1.0f / length;
172
173
6.99k
    if (i == 0)
174
6.99k
    {
175
6.99k
      partition0_len_sq = length_squared;
176
6.99k
    }
177
0
    else
178
0
    {
179
0
      is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
180
0
    }
181
182
181k
    for (unsigned int j = 0; j < partition_texel_count; j++)
183
174k
    {
184
174k
      unsigned int tix = pi.texels_of_partition[i][j];
185
174k
      float value = (data_vr[tix] - lowvalue) * scale;
186
174k
      value = astc::clamp1f(value);
187
188
174k
      ei.weights[tix] = value;
189
174k
      ei.weight_error_scale[tix] = length_squared * error_weight;
190
174k
      assert(!astc::isnan(ei.weight_error_scale[tix]));
191
174k
    }
192
193
6.99k
    ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
194
6.99k
    ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
195
6.99k
  }
196
197
  // Zero initialize any SIMD over-fetch
198
6.99k
  size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
199
9.10k
  for (size_t i = texel_count; i < texel_count_simd; i++)
200
2.10k
  {
201
2.10k
    ei.weights[i] = 0.0f;
202
2.10k
    ei.weight_error_scale[i] = 0.0f;
203
2.10k
  }
204
205
6.99k
  ei.is_constant_weight_error_scale = is_constant_wes;
206
6.99k
}
207
208
/**
209
 * @brief Compute the ideal endpoints and weights for 2 color components.
210
 *
211
 * @param      blk          The image block color data to compress.
212
 * @param      pi           The partition info for the current trial.
213
 * @param[out] ei           The computed ideal endpoints and weights.
214
 * @param      component1   The first color component to compute.
215
 * @param      component2   The second color component to compute.
216
 */
217
static void compute_ideal_colors_and_weights_2_comp(
218
  const image_block& blk,
219
  const partition_info& pi,
220
  endpoints_and_weights& ei,
221
  int component1,
222
  int component2
223
802
) {
224
802
  unsigned int partition_count = pi.partition_count;
225
802
  ei.ep.partition_count = partition_count;
226
802
  promise(partition_count > 0);
227
228
802
  unsigned int texel_count = blk.texel_count;
229
802
  promise(texel_count > 0);
230
231
802
  partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233
802
  float error_weight;
234
802
  const float* data_vr = nullptr;
235
802
  const float* data_vg = nullptr;
236
237
802
  if (component1 == 0 && component2 == 1)
238
303
  {
239
303
    error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
240
241
303
    data_vr = blk.data_r;
242
303
    data_vg = blk.data_g;
243
303
  }
244
499
  else if (component1 == 0 && component2 == 2)
245
250
  {
246
250
    error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
247
248
250
    data_vr = blk.data_r;
249
250
    data_vg = blk.data_b;
250
250
  }
251
249
  else // (component1 == 1 && component2 == 2)
252
249
  {
253
249
    assert(component1 == 1 && component2 == 2);
254
255
249
    error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
256
257
249
    data_vr = blk.data_g;
258
249
    data_vg = blk.data_b;
259
249
  }
260
261
802
  compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
262
263
802
  bool is_constant_wes { true };
264
802
  float partition0_len_sq { 0.0f };
265
266
802
  vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
267
802
  vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
268
269
1.60k
  for (unsigned int i = 0; i < partition_count; i++)
270
802
  {
271
802
    vfloat4 dir = pms[i].dir;
272
802
    if (hadd_s(dir) < 0.0f)
273
8
    {
274
8
      dir = vfloat4::zero() - dir;
275
8
    }
276
277
802
    line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
278
802
    float lowparam { 1e10f };
279
802
    float highparam { -1e10f };
280
281
802
    unsigned int partition_texel_count = pi.partition_texel_count[i];
282
16.6k
    for (unsigned int j = 0; j < partition_texel_count; j++)
283
15.8k
    {
284
15.8k
      unsigned int tix = pi.texels_of_partition[i][j];
285
15.8k
      vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
286
15.8k
      float param = dot_s(point - line.a, line.b);
287
15.8k
      ei.weights[tix] = param;
288
289
15.8k
      lowparam = astc::min(param, lowparam);
290
15.8k
      highparam = astc::max(param, highparam);
291
15.8k
    }
292
293
    // It is possible for a uniform-color partition to produce length=0;
294
    // this causes NaN issues so set to small value to avoid this problem
295
802
    if (highparam <= lowparam)
296
9
    {
297
9
      lowparam = 0.0f;
298
9
      highparam = 1e-7f;
299
9
    }
300
301
802
    float length = highparam - lowparam;
302
802
    float length_squared = length * length;
303
802
    float scale = 1.0f / length;
304
305
802
    if (i == 0)
306
802
    {
307
802
      partition0_len_sq = length_squared;
308
802
    }
309
0
    else
310
0
    {
311
0
      is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
312
0
    }
313
314
16.6k
    for (unsigned int j = 0; j < partition_texel_count; j++)
315
15.8k
    {
316
15.8k
      unsigned int tix = pi.texels_of_partition[i][j];
317
15.8k
      float idx = (ei.weights[tix] - lowparam) * scale;
318
15.8k
      idx = astc::clamp1f(idx);
319
320
15.8k
      ei.weights[tix] = idx;
321
15.8k
      ei.weight_error_scale[tix] = length_squared * error_weight;
322
15.8k
      assert(!astc::isnan(ei.weight_error_scale[tix]));
323
15.8k
    }
324
325
802
    vfloat4 lowvalue = line.a + line.b * lowparam;
326
802
    vfloat4 highvalue = line.a + line.b * highparam;
327
328
802
    vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
329
802
    vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
330
331
802
    ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
332
802
    ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
333
802
  }
334
335
  // Zero initialize any SIMD over-fetch
336
802
  size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
337
917
  for (size_t i = texel_count; i < texel_count_simd; i++)
338
115
  {
339
115
    ei.weights[i] = 0.0f;
340
115
    ei.weight_error_scale[i] = 0.0f;
341
115
  }
342
343
802
  ei.is_constant_weight_error_scale = is_constant_wes;
344
802
}
345
346
/**
347
 * @brief Compute the ideal endpoints and weights for 3 color components.
348
 *
349
 * @param      blk                 The image block color data to compress.
350
 * @param      pi                  The partition info for the current trial.
351
 * @param[out] ei                  The computed ideal endpoints and weights.
352
 * @param      omitted_component   The color component excluded from the calculation.
353
 */
354
static void compute_ideal_colors_and_weights_3_comp(
355
  const image_block& blk,
356
  const partition_info& pi,
357
  endpoints_and_weights& ei,
358
  unsigned int omitted_component
359
7.79k
) {
360
7.79k
  unsigned int partition_count = pi.partition_count;
361
7.79k
  ei.ep.partition_count = partition_count;
362
7.79k
  promise(partition_count > 0);
363
364
7.79k
  unsigned int texel_count = blk.texel_count;
365
7.79k
  promise(texel_count > 0);
366
367
7.79k
  partition_metrics pms[BLOCK_MAX_PARTITIONS];
368
369
7.79k
  float error_weight;
370
7.79k
  const float* data_vr = nullptr;
371
7.79k
  const float* data_vg = nullptr;
372
7.79k
  const float* data_vb = nullptr;
373
7.79k
  if (omitted_component == 0)
374
1.41k
  {
375
1.41k
    error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
376
1.41k
    data_vr = blk.data_g;
377
1.41k
    data_vg = blk.data_b;
378
1.41k
    data_vb = blk.data_a;
379
1.41k
  }
380
6.37k
  else if (omitted_component == 1)
381
1.44k
  {
382
1.44k
    error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
383
1.44k
    data_vr = blk.data_r;
384
1.44k
    data_vg = blk.data_b;
385
1.44k
    data_vb = blk.data_a;
386
1.44k
  }
387
4.92k
  else if (omitted_component == 2)
388
1.53k
  {
389
1.53k
    error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
390
1.53k
    data_vr = blk.data_r;
391
1.53k
    data_vg = blk.data_g;
392
1.53k
    data_vb = blk.data_a;
393
1.53k
  }
394
3.39k
  else
395
3.39k
  {
396
3.39k
    assert(omitted_component == 3);
397
398
3.39k
    error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
399
3.39k
    data_vr = blk.data_r;
400
3.39k
    data_vg = blk.data_g;
401
3.39k
    data_vb = blk.data_b;
402
3.39k
  }
403
404
7.79k
  error_weight = error_weight * (1.0f / 3.0f);
405
406
7.79k
  if (omitted_component == 3)
407
3.39k
  {
408
3.39k
    compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
409
3.39k
  }
410
4.39k
  else
411
4.39k
  {
412
4.39k
    compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
413
4.39k
  }
414
415
7.79k
  bool is_constant_wes { true };
416
7.79k
  float partition0_len_sq { 0.0f };
417
418
17.5k
  for (unsigned int i = 0; i < partition_count; i++)
419
9.79k
  {
420
9.79k
    vfloat4 dir = pms[i].dir;
421
9.79k
    if (hadd_rgb_s(dir) < 0.0f)
422
226
    {
423
226
      dir = vfloat4::zero() - dir;
424
226
    }
425
426
9.79k
    line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
427
9.79k
    float lowparam { 1e10f };
428
9.79k
    float highparam { -1e10f };
429
430
9.79k
    unsigned int partition_texel_count = pi.partition_texel_count[i];
431
201k
    for (unsigned int j = 0; j < partition_texel_count; j++)
432
191k
    {
433
191k
      unsigned int tix = pi.texels_of_partition[i][j];
434
191k
      vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
435
191k
      float param = dot3_s(point - line.a, line.b);
436
191k
      ei.weights[tix] = param;
437
438
191k
      lowparam = astc::min(param, lowparam);
439
191k
      highparam = astc::max(param, highparam);
440
191k
    }
441
442
    // It is possible for a uniform-color partition to produce length=0;
443
    // this causes NaN issues so set to small value to avoid this problem
444
9.79k
    if (highparam <= lowparam)
445
620
    {
446
620
      lowparam = 0.0f;
447
620
      highparam = 1e-7f;
448
620
    }
449
450
9.79k
    float length = highparam - lowparam;
451
9.79k
    float length_squared = length * length;
452
9.79k
    float scale = 1.0f / length;
453
454
9.79k
    if (i == 0)
455
7.79k
    {
456
7.79k
      partition0_len_sq = length_squared;
457
7.79k
    }
458
2.00k
    else
459
2.00k
    {
460
2.00k
      is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
461
2.00k
    }
462
463
201k
    for (unsigned int j = 0; j < partition_texel_count; j++)
464
191k
    {
465
191k
      unsigned int tix = pi.texels_of_partition[i][j];
466
191k
      float idx = (ei.weights[tix] - lowparam) * scale;
467
191k
      idx = astc::clamp1f(idx);
468
469
191k
      ei.weights[tix] = idx;
470
191k
      ei.weight_error_scale[tix] = length_squared * error_weight;
471
191k
      assert(!astc::isnan(ei.weight_error_scale[tix]));
472
191k
    }
473
474
9.79k
    vfloat4 ep0 = line.a + line.b * lowparam;
475
9.79k
    vfloat4 ep1 = line.a + line.b * highparam;
476
477
9.79k
    vfloat4 bmin = blk.data_min;
478
9.79k
    vfloat4 bmax = blk.data_max;
479
480
9.79k
    assert(omitted_component < BLOCK_MAX_COMPONENTS);
481
9.79k
    switch (omitted_component)
482
9.79k
    {
483
1.41k
      case 0:
484
1.41k
        ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
485
1.41k
        ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
486
1.41k
        break;
487
1.44k
      case 1:
488
1.44k
        ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
489
1.44k
        ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
490
1.44k
        break;
491
1.53k
      case 2:
492
1.53k
        ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
493
1.53k
        ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
494
1.53k
        break;
495
5.40k
      default:
496
5.40k
        ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
497
5.40k
        ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
498
5.40k
        break;
499
9.79k
    }
500
9.79k
  }
501
502
  // Zero initialize any SIMD over-fetch
503
7.79k
  size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
504
10.1k
  for (size_t i = texel_count; i < texel_count_simd; i++)
505
2.35k
  {
506
2.35k
    ei.weights[i] = 0.0f;
507
2.35k
    ei.weight_error_scale[i] = 0.0f;
508
2.35k
  }
509
510
7.79k
  ei.is_constant_weight_error_scale = is_constant_wes;
511
7.79k
}
512
513
/**
514
 * @brief Compute the ideal endpoints and weights for 4 color components.
515
 *
516
 * @param      blk   The image block color data to compress.
517
 * @param      pi    The partition info for the current trial.
518
 * @param[out] ei    The computed ideal endpoints and weights.
519
 */
520
static void compute_ideal_colors_and_weights_4_comp(
521
  const image_block& blk,
522
  const partition_info& pi,
523
  endpoints_and_weights& ei
524
9.32k
) {
525
9.32k
  const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
526
527
9.32k
  unsigned int partition_count = pi.partition_count;
528
529
9.32k
  unsigned int texel_count = blk.texel_count;
530
9.32k
  promise(texel_count > 0);
531
9.32k
  promise(partition_count > 0);
532
533
9.32k
  partition_metrics pms[BLOCK_MAX_PARTITIONS];
534
535
9.32k
  compute_avgs_and_dirs_4_comp(pi, blk, pms);
536
537
9.32k
  bool is_constant_wes { true };
538
9.32k
  float partition0_len_sq { 0.0f };
539
540
30.2k
  for (unsigned int i = 0; i < partition_count; i++)
541
20.9k
  {
542
20.9k
    vfloat4 dir = pms[i].dir;
543
20.9k
    if (hadd_rgb_s(dir) < 0.0f)
544
2.20k
    {
545
2.20k
      dir = vfloat4::zero() - dir;
546
2.20k
    }
547
548
20.9k
    line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
549
20.9k
    float lowparam { 1e10f };
550
20.9k
    float highparam { -1e10f };
551
552
20.9k
    unsigned int partition_texel_count = pi.partition_texel_count[i];
553
274k
    for (unsigned int j = 0; j < partition_texel_count; j++)
554
254k
    {
555
254k
      unsigned int tix = pi.texels_of_partition[i][j];
556
254k
      vfloat4 point = blk.texel(tix);
557
254k
      float param = dot_s(point - line.a, line.b);
558
254k
      ei.weights[tix] = param;
559
560
254k
      lowparam = astc::min(param, lowparam);
561
254k
      highparam = astc::max(param, highparam);
562
254k
    }
563
564
    // It is possible for a uniform-color partition to produce length=0;
565
    // this causes NaN issues so set to small value to avoid this problem
566
20.9k
    if (highparam <= lowparam)
567
1.93k
    {
568
1.93k
      lowparam = 0.0f;
569
1.93k
      highparam = 1e-7f;
570
1.93k
    }
571
572
20.9k
    float length = highparam - lowparam;
573
20.9k
    float length_squared = length * length;
574
20.9k
    float scale = 1.0f / length;
575
576
20.9k
    if (i == 0)
577
9.32k
    {
578
9.32k
      partition0_len_sq = length_squared;
579
9.32k
    }
580
11.6k
    else
581
11.6k
    {
582
11.6k
      is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
583
11.6k
    }
584
585
20.9k
    ei.ep.endpt0[i] = line.a + line.b * lowparam;
586
20.9k
    ei.ep.endpt1[i] = line.a + line.b * highparam;
587
588
274k
    for (unsigned int j = 0; j < partition_texel_count; j++)
589
254k
    {
590
254k
      unsigned int tix = pi.texels_of_partition[i][j];
591
254k
      float idx = (ei.weights[tix] - lowparam) * scale;
592
254k
      idx = astc::clamp1f(idx);
593
594
254k
      ei.weights[tix] = idx;
595
254k
      ei.weight_error_scale[tix] = length_squared * error_weight;
596
254k
      assert(!astc::isnan(ei.weight_error_scale[tix]));
597
254k
    }
598
20.9k
  }
599
600
  // Zero initialize any SIMD over-fetch
601
9.32k
  size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
602
12.7k
  for (size_t i = texel_count; i < texel_count_simd; i++)
603
3.44k
  {
604
3.44k
    ei.weights[i] = 0.0f;
605
3.44k
    ei.weight_error_scale[i] = 0.0f;
606
3.44k
  }
607
608
9.32k
  ei.is_constant_weight_error_scale = is_constant_wes;
609
9.32k
}
610
611
/* See header for documentation. */
612
void compute_ideal_colors_and_weights_1plane(
613
  const image_block& blk,
614
  const partition_info& pi,
615
  endpoints_and_weights& ei
616
10.9k
) {
617
10.9k
  bool uses_alpha = !blk.is_constant_channel(3);
618
619
10.9k
  if (uses_alpha)
620
9.32k
  {
621
9.32k
    compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
622
9.32k
  }
623
1.60k
  else
624
1.60k
  {
625
1.60k
    compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
626
1.60k
  }
627
10.9k
}
628
629
/* See header for documentation. */
630
void compute_ideal_colors_and_weights_2planes(
631
  const block_size_descriptor& bsd,
632
  const image_block& blk,
633
  unsigned int plane2_component,
634
  endpoints_and_weights& ei1,
635
  endpoints_and_weights& ei2
636
6.99k
) {
637
6.99k
  const auto& pi = bsd.get_partition_info(1, 0);
638
6.99k
  bool uses_alpha = !blk.is_constant_channel(3);
639
640
6.99k
  assert(plane2_component < BLOCK_MAX_COMPONENTS);
641
6.99k
  switch (plane2_component)
642
6.99k
  {
643
1.66k
  case 0: // Separate weights for red
644
1.66k
    if (uses_alpha)
645
1.41k
    {
646
1.41k
      compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
647
1.41k
    }
648
249
    else
649
249
    {
650
249
      compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
651
249
    }
652
1.66k
    compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
653
1.66k
    break;
654
655
1.69k
  case 1: // Separate weights for green
656
1.69k
    if (uses_alpha)
657
1.44k
    {
658
1.44k
      compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
659
1.44k
    }
660
250
    else
661
250
    {
662
250
      compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
663
250
    }
664
1.69k
    compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
665
1.69k
    break;
666
667
1.83k
  case 2: // Separate weights for blue
668
1.83k
    if (uses_alpha)
669
1.53k
    {
670
1.53k
      compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
671
1.53k
    }
672
303
    else
673
303
    {
674
303
      compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
675
303
    }
676
1.83k
    compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
677
1.83k
    break;
678
679
1.79k
  default: // Separate weights for alpha
680
1.79k
    assert(uses_alpha);
681
1.79k
    compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
682
1.79k
    compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
683
1.79k
    break;
684
6.99k
  }
685
6.99k
}
686
687
/* See header for documentation. */
688
float compute_error_of_weight_set_1plane(
689
  const endpoints_and_weights& eai,
690
  const decimation_info& di,
691
  const float* dec_weight_quant_uvalue
692
176k
) {
693
176k
  vfloatacc error_summav = vfloatacc::zero();
694
176k
  unsigned int texel_count = di.texel_count;
695
176k
  promise(texel_count > 0);
696
697
  // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
698
176k
  if (di.max_texel_weight_count > 2)
699
42.3k
  {
700
979k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
701
936k
    {
702
      // Compute the bilinear interpolation of the decimated weight grid
703
936k
      vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
704
705
      // Compute the error between the computed value and the ideal weight
706
936k
      vfloat actual_values = loada(eai.weights + i);
707
936k
      vfloat diff = current_values - actual_values;
708
936k
      vfloat significance = loada(eai.weight_error_scale + i);
709
936k
      vfloat error = diff * diff * significance;
710
711
936k
      haccumulate(error_summav, error);
712
936k
    }
713
42.3k
  }
714
134k
  else if (di.max_texel_weight_count > 1)
715
59.3k
  {
716
720k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
717
660k
    {
718
      // Compute the bilinear interpolation of the decimated weight grid
719
660k
      vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
720
721
      // Compute the error between the computed value and the ideal weight
722
660k
      vfloat actual_values = loada(eai.weights + i);
723
660k
      vfloat diff = current_values - actual_values;
724
660k
      vfloat significance = loada(eai.weight_error_scale + i);
725
660k
      vfloat error = diff * diff * significance;
726
727
660k
      haccumulate(error_summav, error);
728
660k
    }
729
59.3k
  }
730
74.9k
  else
731
74.9k
  {
732
428k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
733
353k
    {
734
      // Load the weight set directly, without interpolation
735
353k
      vfloat current_values = loada(dec_weight_quant_uvalue + i);
736
737
      // Compute the error between the computed value and the ideal weight
738
353k
      vfloat actual_values = loada(eai.weights + i);
739
353k
      vfloat diff = current_values - actual_values;
740
353k
      vfloat significance = loada(eai.weight_error_scale + i);
741
353k
      vfloat error = diff * diff * significance;
742
743
353k
      haccumulate(error_summav, error);
744
353k
    }
745
74.9k
  }
746
747
  // Resolve the final scalar accumulator sum
748
176k
  return hadd_s(error_summav);
749
176k
}
750
751
/* See header for documentation. */
752
float compute_error_of_weight_set_2planes(
753
  const endpoints_and_weights& eai1,
754
  const endpoints_and_weights& eai2,
755
  const decimation_info& di,
756
  const float* dec_weight_quant_uvalue_plane1,
757
  const float* dec_weight_quant_uvalue_plane2
758
49.1k
) {
759
49.1k
  vfloatacc error_summav = vfloatacc::zero();
760
49.1k
  unsigned int texel_count = di.texel_count;
761
49.1k
  promise(texel_count > 0);
762
763
  // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
764
49.1k
  if (di.max_texel_weight_count > 2)
765
21.4k
  {
766
290k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
767
268k
    {
768
      // Plane 1
769
      // Compute the bilinear interpolation of the decimated weight grid
770
268k
      vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
771
772
      // Compute the error between the computed value and the ideal weight
773
268k
      vfloat actual_values1 = loada(eai1.weights + i);
774
268k
      vfloat diff = current_values1 - actual_values1;
775
268k
      vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
776
777
      // Plane 2
778
      // Compute the bilinear interpolation of the decimated weight grid
779
268k
      vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
780
781
      // Compute the error between the computed value and the ideal weight
782
268k
      vfloat actual_values2 = loada(eai2.weights + i);
783
268k
      diff = current_values2 - actual_values2;
784
268k
      vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
785
786
268k
      haccumulate(error_summav, error1 + error2);
787
268k
    }
788
21.4k
  }
789
27.7k
  else if (di.max_texel_weight_count > 1)
790
17.4k
  {
791
103k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
792
86.1k
    {
793
      // Plane 1
794
      // Compute the bilinear interpolation of the decimated weight grid
795
86.1k
      vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
796
797
      // Compute the error between the computed value and the ideal weight
798
86.1k
      vfloat actual_values1 = loada(eai1.weights + i);
799
86.1k
      vfloat diff = current_values1 - actual_values1;
800
86.1k
      vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
801
802
      // Plane 2
803
      // Compute the bilinear interpolation of the decimated weight grid
804
86.1k
      vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
805
806
      // Compute the error between the computed value and the ideal weight
807
86.1k
      vfloat actual_values2 = loada(eai2.weights + i);
808
86.1k
      diff = current_values2 - actual_values2;
809
86.1k
      vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
810
811
86.1k
      haccumulate(error_summav, error1 + error2);
812
86.1k
    }
813
17.4k
  }
814
10.3k
  else
815
10.3k
  {
816
53.0k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
817
42.6k
    {
818
      // Plane 1
819
      // Load the weight set directly, without interpolation
820
42.6k
      vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
821
822
      // Compute the error between the computed value and the ideal weight
823
42.6k
      vfloat actual_values1 = loada(eai1.weights + i);
824
42.6k
      vfloat diff = current_values1 - actual_values1;
825
42.6k
      vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
826
827
      // Plane 2
828
      // Load the weight set directly, without interpolation
829
42.6k
      vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
830
831
      // Compute the error between the computed value and the ideal weight
832
42.6k
      vfloat actual_values2 = loada(eai2.weights + i);
833
42.6k
      diff = current_values2 - actual_values2;
834
42.6k
      vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
835
836
42.6k
      haccumulate(error_summav, error1 + error2);
837
42.6k
    }
838
10.3k
  }
839
840
  // Resolve the final scalar accumulator sum
841
49.1k
  return hadd_s(error_summav);
842
49.1k
}
843
844
/* See header for documentation. */
845
void compute_ideal_weights_for_decimation(
846
  const endpoints_and_weights& ei,
847
  const decimation_info& di,
848
  float* dec_weight_ideal_value
849
148k
) {
850
148k
  unsigned int texel_count = di.texel_count;
851
148k
  unsigned int weight_count = di.weight_count;
852
148k
  bool is_direct = texel_count == weight_count;
853
148k
  promise(texel_count > 0);
854
148k
  promise(weight_count > 0);
855
856
  // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
857
  // zero-initialized SIMD over-fetch region
858
148k
  if (is_direct)
859
21.3k
  {
860
125k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
861
103k
    {
862
103k
      vfloat weight(ei.weights + i);
863
103k
      storea(weight, dec_weight_ideal_value + i);
864
103k
    }
865
866
21.3k
    return;
867
21.3k
  }
868
869
  // Otherwise compute an estimate and perform single refinement iteration
870
871
  // Compute an initial average for each decimated weight
872
127k
  bool constant_wes = ei.is_constant_weight_error_scale;
873
127k
  vfloat weight_error_scale(ei.weight_error_scale[0]);
874
875
  // This overshoots - this is OK as we initialize the array tails in the
876
  // decimation table structures to safe values ...
877
821k
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
878
693k
  {
879
    // Start with a small value to avoid div-by-zero later
880
693k
    vfloat weight_weight(1e-10f);
881
693k
    vfloat initial_weight = vfloat::zero();
882
883
    // Accumulate error weighting of all the texels using this weight
884
693k
    vint weight_texel_count(di.weight_texel_count + i);
885
693k
    unsigned int max_texel_count = hmax_s(weight_texel_count);
886
693k
    promise(max_texel_count > 0);
887
888
6.00M
    for (unsigned int j = 0; j < max_texel_count; j++)
889
5.30M
    {
890
5.30M
      const uint8_t* texel = di.weight_texels_tr[j] + i;
891
5.30M
      vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
892
893
5.30M
      if (!constant_wes)
894
2.10M
      {
895
2.10M
        weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
896
2.10M
      }
897
898
5.30M
      vfloat contrib_weight = weight * weight_error_scale;
899
900
5.30M
      weight_weight += contrib_weight;
901
5.30M
      initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
902
5.30M
    }
903
904
693k
    storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
905
693k
  }
906
907
  // Populate the interpolated weight grid based on the initial average
908
  // Process SIMD-width texel coordinates at at time while we can. Safe to
909
  // over-process full SIMD vectors - the tail is zeroed.
910
127k
  ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
911
127k
  if (di.max_texel_weight_count <= 2)
912
53.7k
  {
913
528k
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
914
474k
    {
915
474k
      vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
916
474k
      storea(weight, infilled_weights + i);
917
474k
    }
918
53.7k
  }
919
73.5k
  else
920
73.5k
  {
921
1.30M
    for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
922
1.22M
    {
923
1.22M
      vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
924
1.22M
      storea(weight, infilled_weights + i);
925
1.22M
    }
926
73.5k
  }
927
928
  // Perform a single iteration of refinement
929
  // Empirically determined step size; larger values don't help but smaller drops image quality
930
127k
  constexpr float stepsize = 0.25f;
931
127k
  constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
932
933
821k
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
934
693k
  {
935
693k
    vfloat weight_val = loada(dec_weight_ideal_value + i);
936
937
    // Accumulate error weighting of all the texels using this weight
938
    // Start with a small value to avoid div-by-zero later
939
693k
    vfloat error_change0(1e-10f);
940
693k
    vfloat error_change1(0.0f);
941
942
    // Accumulate error weighting of all the texels using this weight
943
693k
    vint weight_texel_count(di.weight_texel_count + i);
944
693k
    unsigned int max_texel_count = hmax_s(weight_texel_count);
945
693k
    promise(max_texel_count > 0);
946
947
6.00M
    for (unsigned int j = 0; j < max_texel_count; j++)
948
5.30M
    {
949
5.30M
      const uint8_t* texel = di.weight_texels_tr[j] + i;
950
5.30M
      vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
951
952
5.30M
      if (!constant_wes)
953
2.10M
      {
954
2.10M
        weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
955
2.10M
      }
956
957
5.30M
      vfloat scale = weight_error_scale * contrib_weight;
958
5.30M
      vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
959
5.30M
      vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
960
961
5.30M
      error_change0 += contrib_weight * scale;
962
5.30M
      error_change1 += (old_weight - ideal_weight) * scale;
963
5.30M
    }
964
965
693k
    vfloat step = (error_change1 * chd_scale) / error_change0;
966
693k
    step = clamp(-stepsize, stepsize, step);
967
968
    // Update the weight; note this can store negative values
969
693k
    storea(weight_val + step, dec_weight_ideal_value + i);
970
693k
  }
971
127k
}
972
973
/* See header for documentation. */
974
void compute_quantized_weights_for_decimation(
975
  const decimation_info& di,
976
  float low_bound,
977
  float high_bound,
978
  const float* dec_weight_ideal_value,
979
  float* weight_set_out,
980
  uint8_t* quantized_weight_set,
981
  quant_method quant_level
982
275k
) {
983
275k
  int weight_count = di.weight_count;
984
275k
  promise(weight_count > 0);
985
275k
  const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
986
987
  // The available quant levels, stored with a minus 1 bias
988
275k
  static const float quant_levels_m1[12] {
989
275k
    1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
990
275k
  };
991
992
275k
  vint steps_m1(get_quant_level(quant_level) - 1);
993
275k
  float quant_level_m1 = quant_levels_m1[quant_level];
994
995
  // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
996
997
  // TODO: Oddity to investigate; triggered by test in issue #265.
998
275k
  if (high_bound <= low_bound)
999
3
  {
1000
3
    low_bound = 0.0f;
1001
3
    high_bound = 1.0f;
1002
3
  }
1003
1004
275k
  float rscale = high_bound - low_bound;
1005
275k
  float scale = 1.0f / rscale;
1006
1007
275k
  float scaled_low_bound = low_bound * scale;
1008
275k
  rscale *= 1.0f / 64.0f;
1009
1010
275k
  vfloat scalev(scale);
1011
275k
  vfloat scaled_low_boundv(scaled_low_bound);
1012
275k
  vfloat quant_level_m1v(quant_level_m1);
1013
275k
  vfloat rscalev(rscale);
1014
275k
  vfloat low_boundv(low_bound);
1015
1016
  // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
1017
  // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
1018
275k
  if (get_quant_level(quant_level) <= 16)
1019
251k
  {
1020
251k
    vtable_16x8 table;
1021
251k
    vtable_prepare(table, qat.quant_to_unquant);
1022
1023
1.65M
    for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1024
1.40M
    {
1025
1.40M
      vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1026
1.40M
      ix = clampzo(ix);
1027
1028
      // Look up the two closest indexes and return the one that was closest
1029
1.40M
      vfloat ix1 = ix * quant_level_m1v;
1030
1031
1.40M
      vint weightl = float_to_int(ix1);
1032
1.40M
      vint weighth = min(weightl + vint(1), steps_m1);
1033
1034
1.40M
      vint ixli = vtable_lookup_32bit(table, weightl);
1035
1.40M
      vint ixhi = vtable_lookup_32bit(table, weighth);
1036
1037
1.40M
      vfloat ixl = int_to_float(ixli);
1038
1.40M
      vfloat ixh = int_to_float(ixhi);
1039
1040
1.40M
      vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1041
1.40M
      vint weight = select(ixli, ixhi, mask);
1042
1.40M
      ixl = select(ixl, ixh, mask);
1043
1044
      // Invert the weight-scaling that was done initially
1045
1.40M
      storea(ixl * rscalev + low_boundv, weight_set_out + i);
1046
1.40M
      pack_and_store_low_bytes(weight, quantized_weight_set + i);
1047
1.40M
    }
1048
251k
  }
1049
23.8k
  else
1050
23.8k
  {
1051
23.8k
    vtable_32x8 table;
1052
23.8k
    vtable_prepare(table, qat.quant_to_unquant);
1053
1054
111k
    for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1055
87.8k
    {
1056
87.8k
      vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1057
87.8k
      ix = clampzo(ix);
1058
1059
      // Look up the two closest indexes and return the one that was closest
1060
87.8k
      vfloat ix1 = ix * quant_level_m1v;
1061
1062
87.8k
      vint weightl = float_to_int(ix1);
1063
87.8k
      vint weighth = min(weightl + vint(1), steps_m1);
1064
1065
87.8k
      vint ixli = vtable_lookup_32bit(table, weightl);
1066
87.8k
      vint ixhi = vtable_lookup_32bit(table, weighth);
1067
1068
87.8k
      vfloat ixl = int_to_float(ixli);
1069
87.8k
      vfloat ixh = int_to_float(ixhi);
1070
1071
87.8k
      vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
1072
87.8k
      vint weight = select(ixli, ixhi, mask);
1073
87.8k
      ixl = select(ixl, ixh, mask);
1074
1075
      // Invert the weight-scaling that was done initially
1076
87.8k
      storea(ixl * rscalev + low_boundv, weight_set_out + i);
1077
87.8k
      pack_and_store_low_bytes(weight, quantized_weight_set + i);
1078
87.8k
    }
1079
23.8k
  }
1080
275k
}
1081
1082
/**
1083
 * @brief Compute the RGB + offset for a HDR endpoint mode #7.
1084
 *
1085
 * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1086
 * gives us ~24 multiplications vs. 96 for a generic inverse.
1087
 *
1088
 *  mat[0] = vfloat4(rgba_ws.x,      0.0f,      0.0f, wght_ws.x);
1089
 *  mat[1] = vfloat4(     0.0f, rgba_ws.y,      0.0f, wght_ws.y);
1090
 *  mat[2] = vfloat4(     0.0f,      0.0f, rgba_ws.z, wght_ws.z);
1091
 *  mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z,      psum);
1092
 *  mat = invert(mat);
1093
 *
1094
 * @param rgba_weight_sum     Sum of partition component error weights.
1095
 * @param weight_weight_sum   Sum of partition component error weights * texel weight.
1096
 * @param rgbq_sum            Sum of partition component error weights * texel weight * color data.
1097
 * @param psum                Sum of RGB color weights * texel weight^2.
1098
 */
1099
static inline vfloat4 compute_rgbo_vector(
1100
  vfloat4 rgba_weight_sum,
1101
  vfloat4 weight_weight_sum,
1102
  vfloat4 rgbq_sum,
1103
  float psum
1104
58.8k
) {
1105
58.8k
  float X = rgba_weight_sum.lane<0>();
1106
58.8k
  float Y = rgba_weight_sum.lane<1>();
1107
58.8k
  float Z = rgba_weight_sum.lane<2>();
1108
58.8k
  float P = weight_weight_sum.lane<0>();
1109
58.8k
  float Q = weight_weight_sum.lane<1>();
1110
58.8k
  float R = weight_weight_sum.lane<2>();
1111
58.8k
  float S = psum;
1112
1113
58.8k
  float PP = P * P;
1114
58.8k
  float QQ = Q * Q;
1115
58.8k
  float RR = R * R;
1116
1117
58.8k
  float SZmRR = S * Z - RR;
1118
58.8k
  float DT = SZmRR * Y - Z * QQ;
1119
58.8k
  float YP = Y * P;
1120
58.8k
  float QX = Q * X;
1121
58.8k
  float YX = Y * X;
1122
58.8k
  float mZYP = -Z * YP;
1123
58.8k
  float mZQX = -Z * QX;
1124
58.8k
  float mRYX = -R * YX;
1125
58.8k
  float ZQP = Z * Q * P;
1126
58.8k
  float RYP = R * YP;
1127
58.8k
  float RQX = R * QX;
1128
1129
  // Compute the reciprocal of matrix determinant
1130
58.8k
  float rdet = 1.0f / (DT * X + mZYP * P);
1131
1132
  // Actually compute the adjugate, and then apply 1/det separately
1133
58.8k
  vfloat4 mat0(DT, ZQP, RYP, mZYP);
1134
58.8k
  vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1135
58.8k
  vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1136
58.8k
  vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1137
58.8k
  vfloat4 vect = rgbq_sum * rdet;
1138
1139
58.8k
  return vfloat4(dot_s(mat0, vect),
1140
58.8k
                 dot_s(mat1, vect),
1141
58.8k
                 dot_s(mat2, vect),
1142
58.8k
                 dot_s(mat3, vect));
1143
58.8k
}
1144
1145
/* See header for documentation. */
1146
void recompute_ideal_colors_1plane(
1147
  const image_block& blk,
1148
  const partition_info& pi,
1149
  const decimation_info& di,
1150
  const uint8_t* dec_weights_uquant,
1151
  endpoints& ep,
1152
  vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1153
  vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1154
45.8k
) {
1155
45.8k
  unsigned int weight_count = di.weight_count;
1156
45.8k
  unsigned int total_texel_count = blk.texel_count;
1157
45.8k
  unsigned int partition_count = pi.partition_count;
1158
1159
45.8k
  promise(weight_count > 0);
1160
45.8k
  promise(total_texel_count > 0);
1161
45.8k
  promise(partition_count > 0);
1162
1163
45.8k
  ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
1164
292k
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1165
246k
  {
1166
246k
    vint unquant_value(dec_weights_uquant + i);
1167
246k
    vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
1168
246k
    storea(unquant_valuef, dec_weight + i);
1169
246k
  }
1170
1171
45.8k
  ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
1172
45.8k
  float* undec_weight_ref;
1173
45.8k
  if (di.max_texel_weight_count == 1)
1174
38.0k
  {
1175
38.0k
    undec_weight_ref = dec_weight;
1176
38.0k
  }
1177
7.83k
  else if (di.max_texel_weight_count <= 2)
1178
6.53k
  {
1179
93.7k
    for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1180
87.1k
    {
1181
87.1k
      vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1182
87.1k
      storea(weight, undec_weight + i);
1183
87.1k
    }
1184
1185
6.53k
    undec_weight_ref = undec_weight;
1186
6.53k
  }
1187
1.29k
  else
1188
1.29k
  {
1189
33.4k
    for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1190
32.1k
    {
1191
32.1k
      vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1192
32.1k
      storea(weight, undec_weight + i);
1193
32.1k
    }
1194
1195
1.29k
    undec_weight_ref = undec_weight;
1196
1.29k
  }
1197
1198
45.8k
  vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1199
1200
145k
  for (unsigned int i = 0; i < partition_count; i++)
1201
99.7k
  {
1202
99.7k
    unsigned int texel_count = pi.partition_texel_count[i];
1203
99.7k
    const uint8_t *texel_indexes = pi.texels_of_partition[i];
1204
1205
    // Only compute a partition mean if more than one partition
1206
99.7k
    if (partition_count > 1)
1207
86.5k
    {
1208
86.5k
      rgba_sum = vfloat4::zero();
1209
86.5k
      promise(texel_count > 0);
1210
911k
      for (unsigned int j = 0; j < texel_count; j++)
1211
824k
      {
1212
824k
        unsigned int tix = texel_indexes[j];
1213
824k
        rgba_sum += blk.texel(tix);
1214
824k
      }
1215
86.5k
    }
1216
1217
99.7k
    rgba_sum = rgba_sum * blk.channel_weight;
1218
99.7k
    vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1219
99.7k
    vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1220
1221
99.7k
    float scale_max = 0.0f;
1222
99.7k
    float scale_min = 1e10f;
1223
1224
99.7k
    float wmin1 = 1.0f;
1225
99.7k
    float wmax1 = 0.0f;
1226
1227
99.7k
    float left_sum_s = 0.0f;
1228
99.7k
    float middle_sum_s = 0.0f;
1229
99.7k
    float right_sum_s = 0.0f;
1230
1231
99.7k
    vfloat4 color_vec_x = vfloat4::zero();
1232
99.7k
    vfloat4 color_vec_y = vfloat4::zero();
1233
1234
99.7k
    vfloat4 scale_vec = vfloat4::zero();
1235
1236
99.7k
    float weight_weight_sum_s = 1e-17f;
1237
1238
99.7k
    vfloat4 color_weight = blk.channel_weight;
1239
99.7k
    float ls_weight = hadd_rgb_s(color_weight);
1240
1241
1.27M
    for (unsigned int j = 0; j < texel_count; j++)
1242
1.17M
    {
1243
1.17M
      unsigned int tix = texel_indexes[j];
1244
1.17M
      vfloat4 rgba = blk.texel(tix);
1245
1246
1.17M
      float idx0 = undec_weight_ref[tix];
1247
1248
1.17M
      float om_idx0 = 1.0f - idx0;
1249
1.17M
      wmin1 = astc::min(idx0, wmin1);
1250
1.17M
      wmax1 = astc::max(idx0, wmax1);
1251
1252
1.17M
      float scale = dot3_s(scale_dir, rgba);
1253
1.17M
      scale_min = astc::min(scale, scale_min);
1254
1.17M
      scale_max = astc::max(scale, scale_max);
1255
1256
1.17M
      left_sum_s   += om_idx0 * om_idx0;
1257
1.17M
      middle_sum_s += om_idx0 * idx0;
1258
1.17M
      right_sum_s  += idx0 * idx0;
1259
1.17M
      weight_weight_sum_s += idx0;
1260
1261
1.17M
      vfloat4 color_idx(idx0);
1262
1.17M
      vfloat4 cwprod = rgba;
1263
1.17M
      vfloat4 cwiprod = cwprod * color_idx;
1264
1265
1.17M
      color_vec_y += cwiprod;
1266
1.17M
      color_vec_x += cwprod - cwiprod;
1267
1268
1.17M
      scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1269
1.17M
    }
1270
1271
99.7k
    vfloat4 left_sum   = vfloat4(left_sum_s) * color_weight;
1272
99.7k
    vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1273
99.7k
    vfloat4 right_sum  = vfloat4(right_sum_s) * color_weight;
1274
99.7k
    vfloat4 lmrs_sum   = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1275
1276
99.7k
    color_vec_x = color_vec_x * color_weight;
1277
99.7k
    color_vec_y = color_vec_y * color_weight;
1278
1279
    // Initialize the luminance and scale vectors with a reasonable default
1280
99.7k
    float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1281
99.7k
    scalediv = astc::clamp1f(scalediv);
1282
1283
99.7k
    vfloat4 sds = scale_dir * scale_max;
1284
1285
99.7k
    rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1286
1287
99.7k
    if (wmin1 >= wmax1 * 0.999f)
1288
10.7k
    {
1289
      // If all weights in the partition were equal, then just take average of all colors in
1290
      // the partition and use that as both endpoint colors
1291
10.7k
      vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1292
1293
10.7k
      vmask4 notnan_mask = avg == avg;
1294
10.7k
      ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1295
10.7k
      ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1296
1297
10.7k
      rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1298
10.7k
    }
1299
88.9k
    else
1300
88.9k
    {
1301
      // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1302
      // set of texel weights and pixel colors
1303
88.9k
      vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1304
88.9k
      vfloat4 color_rdet1 = 1.0f / color_det1;
1305
1306
88.9k
      float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1307
88.9k
      float ls_rdet1 = 1.0f / ls_det1;
1308
1309
88.9k
      vfloat4 color_mss1 = (left_sum * left_sum)
1310
88.9k
                         + (2.0f * middle_sum * middle_sum)
1311
88.9k
                         + (right_sum * right_sum);
1312
1313
88.9k
      float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1314
88.9k
                    + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1315
88.9k
                    + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1316
1317
88.9k
      vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1318
88.9k
      vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1319
1320
88.9k
      vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1321
88.9k
      vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1322
88.9k
      vmask4 full_mask = det_mask & notnan_mask;
1323
1324
88.9k
      ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1325
88.9k
      ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1326
1327
88.9k
      float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1328
88.9k
      float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1329
1330
88.9k
      if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1331
82.6k
      {
1332
82.6k
        float scalediv2 = scale_ep0 / scale_ep1;
1333
82.6k
        vfloat4 sdsm = scale_dir * scale_ep1;
1334
82.6k
        rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1335
82.6k
      }
1336
88.9k
    }
1337
1338
    // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1339
99.7k
    if (blk.rgb_lns[0] || blk.alpha_lns[0])
1340
46.6k
    {
1341
46.6k
      vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1342
46.6k
      float psum = right_sum_s * hadd_rgb_s(color_weight);
1343
1344
46.6k
      vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1345
46.6k
      rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1346
1347
46.6k
      vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1348
46.6k
      rgbo_vectors[i] = rgbovec;
1349
1350
      // We can get a failure due to the use of a singular (non-invertible) matrix
1351
      // If it failed, compute rgbo_vectors[] with a different method ...
1352
46.6k
      if (astc::isnan(dot_s(rgbovec, rgbovec)))
1353
4.43k
      {
1354
4.43k
        vfloat4 v0 = ep.endpt0[i];
1355
4.43k
        vfloat4 v1 = ep.endpt1[i];
1356
1357
4.43k
        float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1358
4.43k
        avgdif = astc::max(avgdif, 0.0f);
1359
1360
4.43k
        vfloat4 avg = (v0 + v1) * 0.5f;
1361
4.43k
        vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1362
4.43k
        rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1363
4.43k
      }
1364
46.6k
    }
1365
99.7k
  }
1366
45.8k
}
1367
1368
/* See header for documentation. */
1369
void recompute_ideal_colors_2planes(
1370
  const image_block& blk,
1371
  const block_size_descriptor& bsd,
1372
  const decimation_info& di,
1373
  const uint8_t* dec_weights_uquant_plane1,
1374
  const uint8_t* dec_weights_uquant_plane2,
1375
  endpoints& ep,
1376
  vfloat4& rgbs_vector,
1377
  vfloat4& rgbo_vector,
1378
  int plane2_component
1379
25.6k
) {
1380
25.6k
  unsigned int weight_count = di.weight_count;
1381
25.6k
  unsigned int total_texel_count = blk.texel_count;
1382
1383
25.6k
  promise(total_texel_count > 0);
1384
25.6k
  promise(weight_count > 0);
1385
1386
25.6k
  ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1387
25.6k
  ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1388
1389
25.6k
  assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1390
1391
131k
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1392
105k
  {
1393
105k
    vint unquant_value1(dec_weights_uquant_plane1 + i);
1394
105k
    vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
1395
105k
    storea(unquant_value1f, dec_weight_plane1 + i);
1396
1397
105k
    vint unquant_value2(dec_weights_uquant_plane2 + i);
1398
105k
    vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
1399
105k
    storea(unquant_value2f, dec_weight_plane2 + i);
1400
105k
  }
1401
1402
25.6k
  ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
1403
25.6k
  ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
1404
1405
25.6k
  float* undec_weight_plane1_ref;
1406
25.6k
  float* undec_weight_plane2_ref;
1407
1408
25.6k
  if (di.max_texel_weight_count == 1)
1409
12.1k
  {
1410
12.1k
    undec_weight_plane1_ref = dec_weight_plane1;
1411
12.1k
    undec_weight_plane2_ref = dec_weight_plane2;
1412
12.1k
  }
1413
13.5k
  else if (di.max_texel_weight_count <= 2)
1414
8.02k
  {
1415
48.8k
    for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1416
40.8k
    {
1417
40.8k
      vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1418
40.8k
      storea(weight, undec_weight_plane1 + i);
1419
1420
40.8k
      weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1421
40.8k
      storea(weight, undec_weight_plane2 + i);
1422
40.8k
    }
1423
1424
8.02k
    undec_weight_plane1_ref = undec_weight_plane1;
1425
8.02k
    undec_weight_plane2_ref = undec_weight_plane2;
1426
8.02k
  }
1427
5.47k
  else
1428
5.47k
  {
1429
79.6k
    for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1430
74.2k
    {
1431
74.2k
      vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1432
74.2k
      storea(weight, undec_weight_plane1 + i);
1433
1434
74.2k
      weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1435
74.2k
      storea(weight, undec_weight_plane2 + i);
1436
74.2k
    }
1437
1438
5.47k
    undec_weight_plane1_ref = undec_weight_plane1;
1439
5.47k
    undec_weight_plane2_ref = undec_weight_plane2;
1440
5.47k
  }
1441
1442
25.6k
  unsigned int texel_count = bsd.texel_count;
1443
25.6k
  vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1444
25.6k
  vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1445
1446
25.6k
  float scale_max = 0.0f;
1447
25.6k
  float scale_min = 1e10f;
1448
1449
25.6k
  float wmin1 = 1.0f;
1450
25.6k
  float wmax1 = 0.0f;
1451
1452
25.6k
  float wmin2 = 1.0f;
1453
25.6k
  float wmax2 = 0.0f;
1454
1455
25.6k
  float left1_sum_s = 0.0f;
1456
25.6k
  float middle1_sum_s = 0.0f;
1457
25.6k
  float right1_sum_s = 0.0f;
1458
1459
25.6k
  float left2_sum_s = 0.0f;
1460
25.6k
  float middle2_sum_s = 0.0f;
1461
25.6k
  float right2_sum_s = 0.0f;
1462
1463
25.6k
  vfloat4 color_vec_x = vfloat4::zero();
1464
25.6k
  vfloat4 color_vec_y = vfloat4::zero();
1465
1466
25.6k
  vfloat4 scale_vec = vfloat4::zero();
1467
1468
25.6k
  vfloat4 weight_weight_sum = vfloat4(1e-17f);
1469
1470
25.6k
  vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1471
25.6k
  vfloat4 color_weight = blk.channel_weight;
1472
25.6k
  float ls_weight = hadd_rgb_s(color_weight);
1473
1474
676k
  for (unsigned int j = 0; j < texel_count; j++)
1475
650k
  {
1476
650k
    vfloat4 rgba = blk.texel(j);
1477
1478
650k
    float idx0 = undec_weight_plane1_ref[j];
1479
1480
650k
    float om_idx0 = 1.0f - idx0;
1481
650k
    wmin1 = astc::min(idx0, wmin1);
1482
650k
    wmax1 = astc::max(idx0, wmax1);
1483
1484
650k
    float scale = dot3_s(scale_dir, rgba);
1485
650k
    scale_min = astc::min(scale, scale_min);
1486
650k
    scale_max = astc::max(scale, scale_max);
1487
1488
650k
    left1_sum_s   += om_idx0 * om_idx0;
1489
650k
    middle1_sum_s += om_idx0 * idx0;
1490
650k
    right1_sum_s  += idx0 * idx0;
1491
1492
650k
    float idx1 = undec_weight_plane2_ref[j];
1493
1494
650k
    float om_idx1 = 1.0f - idx1;
1495
650k
    wmin2 = astc::min(idx1, wmin2);
1496
650k
    wmax2 = astc::max(idx1, wmax2);
1497
1498
650k
    left2_sum_s   += om_idx1 * om_idx1;
1499
650k
    middle2_sum_s += om_idx1 * idx1;
1500
650k
    right2_sum_s  += idx1 * idx1;
1501
1502
650k
    vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1503
1504
650k
    vfloat4 cwprod = rgba;
1505
650k
    vfloat4 cwiprod = cwprod * color_idx;
1506
1507
650k
    color_vec_y += cwiprod;
1508
650k
    color_vec_x += cwprod - cwiprod;
1509
1510
650k
    scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1511
650k
    weight_weight_sum += color_idx;
1512
650k
  }
1513
1514
25.6k
  vfloat4 left1_sum   = vfloat4(left1_sum_s) * color_weight;
1515
25.6k
  vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1516
25.6k
  vfloat4 right1_sum  = vfloat4(right1_sum_s) * color_weight;
1517
25.6k
  vfloat4 lmrs_sum    = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1518
1519
25.6k
  vfloat4 left2_sum   = vfloat4(left2_sum_s) * color_weight;
1520
25.6k
  vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1521
25.6k
  vfloat4 right2_sum  = vfloat4(right2_sum_s) * color_weight;
1522
1523
25.6k
  color_vec_x = color_vec_x * color_weight;
1524
25.6k
  color_vec_y = color_vec_y * color_weight;
1525
1526
  // Initialize the luminance and scale vectors with a reasonable default
1527
25.6k
  float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1528
25.6k
  scalediv = astc::clamp1f(scalediv);
1529
1530
25.6k
  vfloat4 sds = scale_dir * scale_max;
1531
1532
25.6k
  rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1533
1534
25.6k
  if (wmin1 >= wmax1 * 0.999f)
1535
606
  {
1536
    // If all weights in the partition were equal, then just take average of all colors in
1537
    // the partition and use that as both endpoint colors
1538
606
    vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1539
1540
606
    vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1541
606
    vmask4 notnan_mask = avg == avg;
1542
606
    vmask4 full_mask = p1_mask & notnan_mask;
1543
1544
606
    ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1545
606
    ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1546
1547
606
    rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1548
606
  }
1549
25.0k
  else
1550
25.0k
  {
1551
    // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1552
    // set of texel weights and pixel colors
1553
25.0k
    vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1554
25.0k
    vfloat4 color_rdet1 = 1.0f / color_det1;
1555
1556
25.0k
    float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1557
25.0k
    float ls_rdet1 = 1.0f / ls_det1;
1558
1559
25.0k
    vfloat4 color_mss1 = (left1_sum * left1_sum)
1560
25.0k
                       + (2.0f * middle1_sum * middle1_sum)
1561
25.0k
                       + (right1_sum * right1_sum);
1562
1563
25.0k
    float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1564
25.0k
                  + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1565
25.0k
                  + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1566
1567
25.0k
    vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1568
25.0k
    vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1569
1570
25.0k
    float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1571
25.0k
    float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1572
1573
25.0k
    vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1574
25.0k
    vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1575
25.0k
    vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1576
25.0k
    vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1577
1578
25.0k
    ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1579
25.0k
    ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1580
1581
25.0k
    if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1582
22.9k
    {
1583
22.9k
      float scalediv2 = scale_ep0 / scale_ep1;
1584
22.9k
      vfloat4 sdsm = scale_dir * scale_ep1;
1585
22.9k
      rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1586
22.9k
    }
1587
25.0k
  }
1588
1589
25.6k
  if (wmin2 >= wmax2 * 0.999f)
1590
781
  {
1591
    // If all weights in the partition were equal, then just take average of all colors in
1592
    // the partition and use that as both endpoint colors
1593
781
    vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1594
1595
781
    vmask4 notnan_mask = avg == avg;
1596
781
    vmask4 full_mask = p2_mask & notnan_mask;
1597
1598
781
    ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1599
781
    ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1600
781
  }
1601
24.8k
  else
1602
24.8k
  {
1603
    // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1604
    // set of texel weights and pixel colors
1605
24.8k
    vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1606
24.8k
    vfloat4 color_rdet2 = 1.0f / color_det2;
1607
1608
24.8k
    vfloat4 color_mss2 = (left2_sum * left2_sum)
1609
24.8k
                       + (2.0f * middle2_sum * middle2_sum)
1610
24.8k
                       + (right2_sum * right2_sum);
1611
1612
24.8k
    vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1613
24.8k
    vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1614
1615
24.8k
    vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1616
24.8k
    vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1617
24.8k
    vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1618
1619
24.8k
    ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1620
24.8k
    ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1621
24.8k
  }
1622
1623
  // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1624
25.6k
  if (blk.rgb_lns[0] || blk.alpha_lns[0])
1625
12.2k
  {
1626
12.2k
    weight_weight_sum = weight_weight_sum * color_weight;
1627
12.2k
    float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1628
1629
12.2k
    vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1630
12.2k
    rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1631
1632
12.2k
    rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1633
1634
    // We can get a failure due to the use of a singular (non-invertible) matrix
1635
    // If it failed, compute rgbo_vectors[] with a different method ...
1636
12.2k
    if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1637
431
    {
1638
431
      vfloat4 v0 = ep.endpt0[0];
1639
431
      vfloat4 v1 = ep.endpt1[0];
1640
1641
431
      float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1642
431
      avgdif = astc::max(avgdif, 0.0f);
1643
1644
431
      vfloat4 avg = (v0 + v1) * 0.5f;
1645
431
      vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1646
1647
431
      rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1648
431
    }
1649
12.2k
  }
1650
25.6k
}
1651
1652
#endif