Coverage Report

Created: 2026-05-30 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/astc-encoder/Source/astcenc_compress_symbolic.cpp
Line
Count
Source
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2026 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
 * @brief Functions to compress a symbolic block.
22
 */
23
24
#include "astcenc_internal.h"
25
#include "astcenc_diagnostic_trace.h"
26
27
#include <cassert>
28
29
/**
30
 * @brief Merge two planes of endpoints into a single vector.
31
 *
32
 * @param      ep_plane1          The endpoints for plane 1.
33
 * @param      ep_plane2          The endpoints for plane 2.
34
 * @param      component_plane2   The color component for plane 2.
35
 * @param[out] result             The merged output.
36
 */
37
static void merge_endpoints(
38
  const endpoints& ep_plane1,
39
  const endpoints& ep_plane2,
40
  unsigned int component_plane2,
41
  endpoints& result
42
6.99k
) {
43
6.99k
  unsigned int partition_count = ep_plane1.partition_count;
44
6.99k
  assert(partition_count == 1);
45
46
6.99k
  vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
47
48
6.99k
  result.partition_count = partition_count;
49
6.99k
  result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
50
6.99k
  result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
51
6.99k
}
52
53
/**
54
 * @brief Attempt to improve weights given a chosen configuration.
55
 *
56
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
57
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
58
 * down by one quantization step.
59
 *
60
 * This is a specialized function which only supports operating on undecimated weight grids,
61
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
62
 * is needed less often.
63
 *
64
 * @param      decode_mode   The decode mode (LDR, HDR).
65
 * @param      bsd           The block size information.
66
 * @param      blk           The image block color data to compress.
67
 * @param[out] scb           The symbolic compressed block output.
68
 */
69
static bool realign_weights_undecimated(
70
  astcenc_profile decode_mode,
71
  const block_size_descriptor& bsd,
72
  const image_block& blk,
73
  symbolic_compressed_block& scb
74
33.9k
) {
75
  // Get the partition descriptor
76
33.9k
  unsigned int partition_count = scb.partition_count;
77
33.9k
  const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
78
79
  // Get the quantization table
80
33.9k
  const block_mode& bm = bsd.get_block_mode(scb.block_mode);
81
33.9k
  unsigned int weight_quant_level = bm.quant_mode;
82
33.9k
  const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
83
84
33.9k
  unsigned int max_plane = bm.is_dual_plane;
85
33.9k
  int plane2_component = scb.plane2_component;
86
33.9k
  vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
87
88
  // Decode the color endpoints
89
33.9k
  bool rgb_hdr;
90
33.9k
  bool alpha_hdr;
91
33.9k
  vint4 endpnt0[BLOCK_MAX_PARTITIONS];
92
33.9k
  vint4 endpnt1[BLOCK_MAX_PARTITIONS];
93
33.9k
  vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
94
33.9k
  vfloat4 offset[BLOCK_MAX_PARTITIONS];
95
96
33.9k
  promise(partition_count > 0);
97
98
92.8k
  for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
99
58.8k
  {
100
58.8k
    unpack_color_endpoints(decode_mode,
101
58.8k
                           scb.color_formats[pa_idx],
102
58.8k
                           scb.color_values[pa_idx],
103
58.8k
                           rgb_hdr, alpha_hdr,
104
58.8k
                           endpnt0[pa_idx],
105
58.8k
                           endpnt1[pa_idx]);
106
58.8k
  }
107
108
33.9k
  uint8_t* dec_weights_uquant = scb.weights;
109
33.9k
  bool adjustments = false;
110
111
  // For each plane and partition ...
112
76.6k
  for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
113
42.6k
  {
114
110k
    for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
115
67.5k
    {
116
      // Compute the endpoint delta for all components in current plane
117
67.5k
      vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
118
67.5k
      epd = select(epd, vint4::zero(), plane_mask);
119
120
67.5k
      endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
121
67.5k
      offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
122
67.5k
    }
123
124
    // For each weight compute previous, current, and next errors
125
42.6k
    promise(bsd.texel_count > 0);
126
791k
    for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
127
749k
    {
128
749k
      int uqw = dec_weights_uquant[texel];
129
130
749k
      uint32_t prev_and_next = qat.prev_next_values[uqw];
131
749k
      int uqw_down = prev_and_next & 0xFF;
132
749k
      int uqw_up = (prev_and_next >> 8) & 0xFF;
133
134
      // Interpolate the colors to create the diffs
135
749k
      float weight_base = static_cast<float>(uqw);
136
749k
      float weight_down = static_cast<float>(uqw_down - uqw);
137
749k
      float weight_up = static_cast<float>(uqw_up - uqw);
138
139
749k
      unsigned int partition = pi.partition_of_texel[texel];
140
749k
      vfloat4 color_offset = offset[partition];
141
749k
      vfloat4 color_base   = endpnt0f[partition];
142
143
749k
      vfloat4 color = color_base + color_offset * weight_base;
144
749k
      vfloat4 orig_color   = blk.texel(texel);
145
749k
      vfloat4 error_weight = blk.channel_weight;
146
147
749k
      vfloat4 color_diff      = color - orig_color;
148
749k
      vfloat4 color_diff_down = color_diff + color_offset * weight_down;
149
749k
      vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
150
151
749k
      float error_base = dot_s(color_diff      * color_diff,      error_weight);
152
749k
      float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
153
749k
      float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
154
155
      // Check if the prev or next error is better, and if so use it
156
749k
      if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
157
54.7k
      {
158
54.7k
        dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
159
54.7k
        adjustments = true;
160
54.7k
      }
161
694k
      else if ((error_down < error_base) && (uqw > 0))
162
54.1k
      {
163
54.1k
        dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
164
54.1k
        adjustments = true;
165
54.1k
      }
166
749k
    }
167
168
    // Prepare iteration for plane 2
169
42.6k
    dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
170
42.6k
    plane_mask = ~plane_mask;
171
42.6k
  }
172
173
33.9k
  return adjustments;
174
33.9k
}
175
176
/**
177
 * @brief Attempt to improve weights given a chosen configuration.
178
 *
179
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
180
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
181
 * down by one quantization step.
182
 *
183
 * @param      decode_mode   The decode mode (LDR, HDR).
184
 * @param      bsd           The block size information.
185
 * @param      blk           The image block color data to compress.
186
 * @param[out] scb           The symbolic compressed block output.
187
 */
188
static bool realign_weights_decimated(
189
  astcenc_profile decode_mode,
190
  const block_size_descriptor& bsd,
191
  const image_block& blk,
192
  symbolic_compressed_block& scb
193
12.2k
) {
194
  // Get the partition descriptor
195
12.2k
  unsigned int partition_count = scb.partition_count;
196
12.2k
  const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
197
198
  // Get the quantization table
199
12.2k
  const block_mode& bm = bsd.get_block_mode(scb.block_mode);
200
12.2k
  unsigned int weight_quant_level = bm.quant_mode;
201
12.2k
  const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
202
203
  // Get the decimation table
204
12.2k
  const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
205
12.2k
  unsigned int weight_count = di.weight_count;
206
12.2k
  assert(weight_count != bsd.texel_count);
207
208
12.2k
  unsigned int max_plane = bm.is_dual_plane;
209
12.2k
  int plane2_component = scb.plane2_component;
210
12.2k
  vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
211
212
  // Decode the color endpoints
213
12.2k
  bool rgb_hdr;
214
12.2k
  bool alpha_hdr;
215
12.2k
  vint4 endpnt0[BLOCK_MAX_PARTITIONS];
216
12.2k
  vint4 endpnt1[BLOCK_MAX_PARTITIONS];
217
12.2k
  vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
218
12.2k
  vfloat4 offset[BLOCK_MAX_PARTITIONS];
219
220
12.2k
  promise(partition_count > 0);
221
12.2k
  promise(weight_count > 0);
222
223
28.5k
  for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
224
16.3k
  {
225
16.3k
    unpack_color_endpoints(decode_mode,
226
16.3k
                           scb.color_formats[pa_idx],
227
16.3k
                           scb.color_values[pa_idx],
228
16.3k
                           rgb_hdr, alpha_hdr,
229
16.3k
                           endpnt0[pa_idx],
230
16.3k
                           endpnt1[pa_idx]);
231
16.3k
  }
232
233
12.2k
  uint8_t* dec_weights_uquant = scb.weights;
234
12.2k
  bool adjustments = false;
235
236
  // For each plane and partition ...
237
32.0k
  for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
238
19.8k
  {
239
43.8k
    for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
240
23.9k
    {
241
      // Compute the endpoint delta for all components in current plane
242
23.9k
      vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
243
23.9k
      epd = select(epd, vint4::zero(), plane_mask);
244
245
23.9k
      endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
246
23.9k
      offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
247
23.9k
    }
248
249
    // Create an unquantized weight grid for this decimation level
250
19.8k
    ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
251
118k
    for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
252
99.0k
    {
253
99.0k
      vint unquant_value(dec_weights_uquant + we_idx);
254
99.0k
      vfloat unquant_valuef = int_to_float(unquant_value);
255
99.0k
      storea(unquant_valuef, uq_weightsf + we_idx);
256
99.0k
    }
257
258
    // For each weight compute previous, current, and next errors
259
407k
    for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
260
387k
    {
261
387k
      int uqw = dec_weights_uquant[we_idx];
262
387k
      uint32_t prev_and_next = qat.prev_next_values[uqw];
263
264
387k
      float uqw_base = uq_weightsf[we_idx];
265
387k
      float uqw_down = static_cast<float>(prev_and_next & 0xFF);
266
387k
      float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
267
268
387k
      float uqw_diff_down = uqw_down - uqw_base;
269
387k
      float uqw_diff_up = uqw_up - uqw_base;
270
271
387k
      vfloat4 error_basev = vfloat4::zero();
272
387k
      vfloat4 error_downv = vfloat4::zero();
273
387k
      vfloat4 error_upv = vfloat4::zero();
274
275
      // Interpolate the colors to create the diffs
276
387k
      unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
277
387k
      promise(texels_to_evaluate > 0);
278
2.21M
      for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
279
1.82M
      {
280
1.82M
        unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
281
282
1.82M
        float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
283
284
1.82M
        float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
285
1.82M
                           + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
286
1.82M
                        + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
287
1.82M
                           + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
288
289
        // Ideally this is integer rounded, but IQ gain it isn't worth the overhead
290
        // float weight = astc::flt_rd(weight_base + 0.5f);
291
        // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
292
        // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
293
1.82M
        float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
294
1.82M
        float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
295
296
1.82M
        unsigned int partition = pi.partition_of_texel[texel];
297
1.82M
        vfloat4 color_offset = offset[partition];
298
1.82M
        vfloat4 color_base   = endpnt0f[partition];
299
300
1.82M
        vfloat4 color = color_base + color_offset * weight_base;
301
1.82M
        vfloat4 orig_color = blk.texel(texel);
302
303
1.82M
        vfloat4 color_diff      = color - orig_color;
304
1.82M
        vfloat4 color_down_diff = color_diff + color_offset * weight_down;
305
1.82M
        vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
306
307
1.82M
        error_basev += color_diff * color_diff;
308
1.82M
        error_downv += color_down_diff * color_down_diff;
309
1.82M
        error_upv   += color_up_diff * color_up_diff;
310
1.82M
      }
311
312
387k
      vfloat4 error_weight = blk.channel_weight;
313
387k
      float error_base = hadd_s(error_basev * error_weight);
314
387k
      float error_down = hadd_s(error_downv * error_weight);
315
387k
      float error_up   = hadd_s(error_upv   * error_weight);
316
317
      // Check if the prev or next error is better, and if so use it
318
387k
      if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
319
17.6k
      {
320
17.6k
        uq_weightsf[we_idx] = uqw_up;
321
17.6k
        dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
322
17.6k
        adjustments = true;
323
17.6k
      }
324
369k
      else if ((error_down < error_base) && (uqw > 0))
325
33.0k
      {
326
33.0k
        uq_weightsf[we_idx] = uqw_down;
327
33.0k
        dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
328
33.0k
        adjustments = true;
329
33.0k
      }
330
387k
    }
331
332
    // Prepare iteration for plane 2
333
19.8k
    dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
334
19.8k
    plane_mask = ~plane_mask;
335
19.8k
  }
336
337
12.2k
  return adjustments;
338
12.2k
}
339
340
/**
341
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
342
 *
343
 * @param      config                    The compressor configuration.
344
 * @param      bsd                       The block size information.
345
 * @param      blk                       The image block color data to compress.
346
 * @param      only_always               True if we only use "always" percentile block modes.
347
 * @param      tune_errorval_threshold   The error value threshold.
348
 * @param      partition_count           The partition count.
349
 * @param      partition_index           The partition index if @c partition_count is 2-4.
350
 * @param[out] scb                       The symbolic compressed block output.
351
 * @param[out] tmpbuf                    The quantized weights for plane 1.
352
 */
353
static float compress_symbolic_block_for_partition_1plane(
354
  const astcenc_config& config,
355
  const block_size_descriptor& bsd,
356
  const image_block& blk,
357
  bool only_always,
358
  float tune_errorval_threshold,
359
  unsigned int partition_count,
360
  unsigned int partition_index,
361
  symbolic_compressed_block& scb,
362
  compression_working_buffers& tmpbuf,
363
  int quant_limit
364
10.9k
) {
365
10.9k
  promise(partition_count > 0);
366
10.9k
  promise(config.tune_candidate_limit > 0);
367
10.9k
  promise(config.tune_refinement_limit > 0);
368
369
10.9k
  int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
370
371
10.9k
  auto compute_difference = &compute_symbolic_block_difference_1plane;
372
10.9k
  if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
373
1.76k
  {
374
1.76k
    compute_difference = &compute_symbolic_block_difference_1plane_1partition;
375
1.76k
  }
376
377
10.9k
  const auto& pi = bsd.get_partition_info(partition_count, partition_index);
378
379
  // Compute ideal weights and endpoint colors, with no quantization or decimation
380
10.9k
  endpoints_and_weights& ei = tmpbuf.ei1;
381
10.9k
  compute_ideal_colors_and_weights_1plane(blk, pi, ei);
382
383
  // Compute ideal weights and endpoint colors for every decimation
384
10.9k
  float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
385
10.9k
  uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
386
387
  // For each decimation mode, compute an ideal set of weights with no quantization
388
10.9k
  unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
389
10.9k
                                                  : bsd.decimation_mode_count_selected;
390
10.9k
  promise(max_decimation_modes > 0);
391
116k
  for (unsigned int i = 0; i < max_decimation_modes; i++)
392
105k
  {
393
105k
    const auto& dm = bsd.get_decimation_mode(i);
394
105k
    if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
395
34.5k
    {
396
34.5k
      continue;
397
34.5k
    }
398
399
70.7k
    const auto& di = bsd.get_decimation_info(i);
400
401
70.7k
    compute_ideal_weights_for_decimation(
402
70.7k
        ei,
403
70.7k
        di,
404
70.7k
        dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
405
70.7k
  }
406
407
  // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
408
  // weight pair, compute the smallest weight that will result in a color value greater than 1
409
10.9k
  vfloat4 min_ep(10.0f);
410
35.4k
  for (unsigned int i = 0; i < partition_count; i++)
411
24.5k
  {
412
24.5k
    vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
413
414
24.5k
    vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
415
24.5k
    min_ep = select(min_ep, ep, use_ep);
416
24.5k
  }
417
418
10.9k
  float min_wt_cutoff = hmin_s(min_ep);
419
420
  // For each mode, use the angular method to compute a shift
421
10.9k
  compute_angular_endpoints_1plane(
422
10.9k
      only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
423
424
10.9k
  float* weight_low_value = tmpbuf.weight_low_value1;
425
10.9k
  float* weight_high_value = tmpbuf.weight_high_value1;
426
10.9k
  int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
427
10.9k
  float* qwt_errors = tmpbuf.qwt_errors;
428
429
  // For each mode (which specifies a decimation and a quantization):
430
  //     * Compute number of bits needed for the quantized weights
431
  //     * Generate an optimized set of quantized weights
432
  //     * Compute quantization errors for the mode
433
434
10.9k
  static const int8_t free_bits_for_partition_count[4] {
435
10.9k
    115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
436
10.9k
  };
437
438
10.9k
  unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
439
10.9k
                                             : bsd.block_mode_count_1plane_selected;
440
10.9k
  promise(max_block_modes > 0);
441
280k
  for (unsigned int i = 0; i < max_block_modes; i++)
442
269k
  {
443
269k
    const block_mode& bm = bsd.block_modes[i];
444
445
269k
    if (bm.quant_mode > max_weight_quant)
446
91.2k
    {
447
91.2k
      qwt_errors[i] = 1e38f;
448
91.2k
      continue;
449
91.2k
    }
450
451
269k
    assert(!bm.is_dual_plane);
452
178k
    int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
453
178k
    if (bitcount <= 0)
454
1.29k
    {
455
1.29k
      qwt_errors[i] = 1e38f;
456
1.29k
      continue;
457
1.29k
    }
458
459
176k
    if (weight_high_value[i] > 1.02f * min_wt_cutoff)
460
35.0k
    {
461
35.0k
      weight_high_value[i] = 1.0f;
462
35.0k
    }
463
464
176k
    int decimation_mode = bm.decimation_mode;
465
176k
    const auto& di = bsd.get_decimation_info(decimation_mode);
466
467
176k
    qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
468
469
176k
    ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
470
471
    // Generate the optimized set of weights for the weight mode
472
176k
    compute_quantized_weights_for_decimation(
473
176k
        di,
474
176k
        weight_low_value[i], weight_high_value[i],
475
176k
        dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
476
176k
        dec_weights_uquantf,
477
176k
        dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
478
176k
        bm.get_weight_quant_mode());
479
480
    // Compute weight quantization errors for the block mode
481
176k
    qwt_errors[i] = compute_error_of_weight_set_1plane(
482
176k
        ei,
483
176k
        di,
484
176k
        dec_weights_uquantf);
485
176k
  }
486
487
  // Decide the optimal combination of color endpoint encodings and weight encodings
488
10.9k
  uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
489
10.9k
  int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
490
491
10.9k
  quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
492
10.9k
  quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
493
494
10.9k
  unsigned int candidate_count = compute_ideal_endpoint_formats(
495
10.9k
      pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
496
10.9k
      config.tune_candidate_limit, 0, max_block_modes,
497
10.9k
      partition_format_specifiers, block_mode_index,
498
10.9k
      color_quant_level, color_quant_level_mod, tmpbuf);
499
500
  // Iterate over the N believed-to-be-best modes to find out which one is actually best
501
10.9k
  float best_errorval_in_mode = ERROR_CALC_DEFAULT;
502
10.9k
  float best_errorval_in_scb = scb.errorval;
503
504
41.7k
  for (unsigned int i = 0; i < candidate_count; i++)
505
30.8k
  {
506
30.8k
    TRACE_NODE(node0, "candidate");
507
508
30.8k
    const int bm_packed_index = block_mode_index[i];
509
30.8k
    assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
510
30.8k
    const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
511
512
30.8k
    int decimation_mode = qw_bm.decimation_mode;
513
30.8k
    const auto& di = bsd.get_decimation_info(decimation_mode);
514
30.8k
    promise(di.weight_count > 0);
515
516
30.8k
    trace_add_data("weight_x", di.weight_x);
517
30.8k
    trace_add_data("weight_y", di.weight_y);
518
30.8k
    trace_add_data("weight_z", di.weight_z);
519
30.8k
    trace_add_data("weight_quant", qw_bm.quant_mode);
520
521
    // Recompute the ideal color endpoints before storing them
522
30.8k
    vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
523
30.8k
    vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
524
525
30.8k
    symbolic_compressed_block workscb;
526
30.8k
    endpoints workep = ei.ep;
527
528
30.8k
    uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
529
530
694k
    for (unsigned int j = 0; j < di.weight_count; j++)
531
664k
    {
532
664k
      workscb.weights[j] = u8_weight_src[j];
533
664k
    }
534
535
48.7k
    for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
536
45.8k
    {
537
45.8k
      recompute_ideal_colors_1plane(
538
45.8k
          blk, pi, di, workscb.weights,
539
45.8k
          workep, rgbs_colors, rgbo_colors);
540
541
      // Quantize the chosen color, tracking if worth trying the mod value
542
45.8k
      bool all_same = color_quant_level[i] != color_quant_level_mod[i];
543
145k
      for (unsigned int j = 0; j < partition_count; j++)
544
99.7k
      {
545
99.7k
        workscb.color_formats[j] = pack_color_endpoints(
546
99.7k
            workep.endpt0[j],
547
99.7k
            workep.endpt1[j],
548
99.7k
            rgbs_colors[j],
549
99.7k
            rgbo_colors[j],
550
99.7k
            partition_format_specifiers[i][j],
551
99.7k
            workscb.color_values[j],
552
99.7k
            color_quant_level[i]);
553
554
99.7k
        all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
555
99.7k
      }
556
557
      // If all the color endpoint modes are the same, we get a few more bits to store colors;
558
      // let's see if we can take advantage of this: requantize all the colors and see if the
559
      // endpoint modes remain the same.
560
45.8k
      workscb.color_formats_matched = 0;
561
45.8k
      if (partition_count >= 2 && all_same)
562
8.87k
      {
563
8.87k
        uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
564
8.87k
        uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
565
8.87k
        bool all_same_mod = true;
566
33.3k
        for (unsigned int j = 0; j < partition_count; j++)
567
24.8k
        {
568
24.8k
          color_formats_mod[j] = pack_color_endpoints(
569
24.8k
              workep.endpt0[j],
570
24.8k
              workep.endpt1[j],
571
24.8k
              rgbs_colors[j],
572
24.8k
              rgbo_colors[j],
573
24.8k
              partition_format_specifiers[i][j],
574
24.8k
              colorvals[j],
575
24.8k
              color_quant_level_mod[i]);
576
577
          // Early out as soon as it's no longer possible to use mod
578
24.8k
          if (color_formats_mod[j] != color_formats_mod[0])
579
416
          {
580
416
            all_same_mod = false;
581
416
            break;
582
416
          }
583
24.8k
        }
584
585
8.87k
        if (all_same_mod)
586
8.45k
        {
587
8.45k
          workscb.color_formats_matched = 1;
588
42.2k
          for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
589
33.8k
          {
590
304k
            for (unsigned int k = 0; k < 8; k++)
591
270k
            {
592
270k
              workscb.color_values[j][k] = colorvals[j][k];
593
270k
            }
594
595
33.8k
            workscb.color_formats[j] = color_formats_mod[j];
596
33.8k
          }
597
8.45k
        }
598
8.87k
      }
599
600
      // Store header fields
601
45.8k
      workscb.partition_count = static_cast<uint8_t>(partition_count);
602
45.8k
      workscb.partition_index = static_cast<uint16_t>(partition_index);
603
45.8k
      workscb.plane2_component = -1;
604
45.8k
      workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
605
45.8k
      workscb.block_mode = qw_bm.mode_index;
606
45.8k
      workscb.block_type = SYM_BTYPE_NONCONST;
607
608
      // Pre-realign test
609
45.8k
      if (l == 0)
610
30.8k
      {
611
30.8k
        float errorval = compute_difference(config, bsd, workscb, blk);
612
30.8k
        if (errorval == -ERROR_CALC_DEFAULT)
613
5.92k
        {
614
5.92k
          errorval = -errorval;
615
5.92k
          workscb.block_type = SYM_BTYPE_ERROR;
616
5.92k
        }
617
618
30.8k
        trace_add_data("error_prerealign", errorval);
619
30.8k
        best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
620
621
        // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
622
        // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
623
        // drive a heuristic to skip blocks that are unlikely to catch up with the best
624
        // block we have already.
625
30.8k
        unsigned int iters_remaining = config.tune_refinement_limit - l;
626
30.8k
        float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
627
30.8k
        if (errorval > (threshold * best_errorval_in_scb))
628
15.9k
        {
629
15.9k
          break;
630
15.9k
        }
631
632
14.9k
        if (errorval < best_errorval_in_scb)
633
3.88k
        {
634
3.88k
          best_errorval_in_scb = errorval;
635
3.88k
          workscb.errorval = errorval;
636
3.88k
          scb = workscb;
637
638
3.88k
          if (errorval < tune_errorval_threshold)
639
29
          {
640
            // Skip remaining candidates - this is "good enough"
641
29
            i = candidate_count;
642
29
            break;
643
29
          }
644
3.88k
        }
645
14.9k
      }
646
647
29.8k
      bool adjustments;
648
29.8k
      if (di.weight_count != bsd.texel_count)
649
4.59k
      {
650
4.59k
        adjustments = realign_weights_decimated(
651
4.59k
          config.profile, bsd, blk, workscb);
652
4.59k
      }
653
25.2k
      else
654
25.2k
      {
655
25.2k
        adjustments = realign_weights_undecimated(
656
25.2k
          config.profile, bsd, blk, workscb);
657
25.2k
      }
658
659
      // Post-realign test
660
29.8k
      float errorval = compute_difference(config, bsd, workscb, blk);
661
29.8k
      if (errorval == -ERROR_CALC_DEFAULT)
662
5.44k
      {
663
5.44k
        errorval = -errorval;
664
5.44k
        workscb.block_type = SYM_BTYPE_ERROR;
665
5.44k
      }
666
667
29.8k
      trace_add_data("error_postrealign", errorval);
668
29.8k
      best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
669
670
      // Average refinement improvement is 3.5% per iteration, so skip blocks that are
671
      // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
672
      // give benefit of the doubt ...
673
29.8k
      unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
674
29.8k
      float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
675
29.8k
      if (errorval > (threshold * best_errorval_in_scb))
676
4.35k
      {
677
4.35k
        break;
678
4.35k
      }
679
680
25.5k
      if (errorval < best_errorval_in_scb)
681
6.50k
      {
682
6.50k
        best_errorval_in_scb = errorval;
683
6.50k
        workscb.errorval = errorval;
684
6.50k
        scb = workscb;
685
686
6.50k
        if (errorval < tune_errorval_threshold)
687
7
        {
688
          // Skip remaining candidates - this is "good enough"
689
7
          i = candidate_count;
690
7
          break;
691
7
        }
692
6.50k
      }
693
694
25.5k
      if (!adjustments)
695
7.64k
      {
696
7.64k
        break;
697
7.64k
      }
698
25.5k
    }
699
30.8k
  }
700
701
10.9k
  return best_errorval_in_mode;
702
10.9k
}
703
704
/**
705
 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
706
 *
707
 * @param      config                    The compressor configuration.
708
 * @param      bsd                       The block size information.
709
 * @param      blk                       The image block color data to compress.
710
 * @param      tune_errorval_threshold   The error value threshold.
711
 * @param      plane2_component          The component index for the second plane of weights.
712
 * @param[out] scb                       The symbolic compressed block output.
713
 * @param[out] tmpbuf                    The quantized weights for plane 1.
714
 */
715
static float compress_symbolic_block_for_partition_2planes(
716
  const astcenc_config& config,
717
  const block_size_descriptor& bsd,
718
  const image_block& blk,
719
  float tune_errorval_threshold,
720
  unsigned int plane2_component,
721
  symbolic_compressed_block& scb,
722
  compression_working_buffers& tmpbuf,
723
  int quant_limit
724
6.99k
) {
725
6.99k
  promise(config.tune_candidate_limit > 0);
726
6.99k
  promise(config.tune_refinement_limit > 0);
727
6.99k
  promise(bsd.decimation_mode_count_selected > 0);
728
729
6.99k
  int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
730
731
  // Compute ideal weights and endpoint colors, with no quantization or decimation
732
6.99k
  endpoints_and_weights& ei1 = tmpbuf.ei1;
733
6.99k
  endpoints_and_weights& ei2 = tmpbuf.ei2;
734
735
6.99k
  compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
736
737
  // Compute ideal weights and endpoint colors for every decimation
738
6.99k
  float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
739
6.99k
  uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
740
741
  // For each decimation mode, compute an ideal set of weights with no quantization
742
72.7k
  for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
743
65.7k
  {
744
65.7k
    const auto& dm = bsd.get_decimation_mode(i);
745
65.7k
    if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
746
26.7k
    {
747
26.7k
      continue;
748
26.7k
    }
749
750
38.9k
    const auto& di = bsd.get_decimation_info(i);
751
752
38.9k
    compute_ideal_weights_for_decimation(
753
38.9k
        ei1,
754
38.9k
        di,
755
38.9k
        dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
756
757
38.9k
    compute_ideal_weights_for_decimation(
758
38.9k
        ei2,
759
38.9k
        di,
760
38.9k
        dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
761
38.9k
  }
762
763
  // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
764
  // weight pair, compute the smallest weight that will result in a color value greater than 1
765
6.99k
  vfloat4 min_ep1(10.0f);
766
6.99k
  vfloat4 min_ep2(10.0f);
767
768
6.99k
  vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
769
6.99k
  vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
770
6.99k
  min_ep1 = select(min_ep1, ep1, use_ep1);
771
772
6.99k
  vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
773
6.99k
  vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
774
6.99k
  min_ep2 = select(min_ep2, ep2, use_ep2);
775
776
6.99k
  vfloat4 err_max(ERROR_CALC_DEFAULT);
777
6.99k
  vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
778
779
  // Set the plane2 component to max error in ep1
780
6.99k
  min_ep1 = select(min_ep1, err_max, err_mask);
781
782
6.99k
  float min_wt_cutoff1 = hmin_s(min_ep1);
783
784
  // Set the minwt2 to the plane2 component min in ep2
785
6.99k
  float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
786
787
6.99k
  compute_angular_endpoints_2planes(
788
6.99k
      bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
789
790
  // For each mode (which specifies a decimation and a quantization):
791
  //     * Compute number of bits needed for the quantized weights
792
  //     * Generate an optimized set of quantized weights
793
  //     * Compute quantization errors for the mode
794
795
6.99k
  float* weight_low_value1 = tmpbuf.weight_low_value1;
796
6.99k
  float* weight_high_value1 = tmpbuf.weight_high_value1;
797
6.99k
  float* weight_low_value2 = tmpbuf.weight_low_value2;
798
6.99k
  float* weight_high_value2 = tmpbuf.weight_high_value2;
799
800
6.99k
  int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
801
6.99k
  float* qwt_errors = tmpbuf.qwt_errors;
802
803
6.99k
  unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
804
6.99k
  unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
805
806
78.8k
  for (unsigned int i = start_2plane; i < end_2plane; i++)
807
71.8k
  {
808
71.8k
    const block_mode& bm = bsd.block_modes[i];
809
71.8k
    assert(bm.is_dual_plane);
810
811
71.8k
    if (bm.quant_mode > max_weight_quant)
812
22.6k
    {
813
22.6k
      qwt_errors[i] = 1e38f;
814
22.6k
      continue;
815
22.6k
    }
816
817
49.1k
    qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
818
819
49.1k
    if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
820
6.42k
    {
821
6.42k
      weight_high_value1[i] = 1.0f;
822
6.42k
    }
823
824
49.1k
    if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
825
575
    {
826
575
      weight_high_value2[i] = 1.0f;
827
575
    }
828
829
49.1k
    unsigned int decimation_mode = bm.decimation_mode;
830
49.1k
    const auto& di = bsd.get_decimation_info(decimation_mode);
831
832
49.1k
    ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
833
834
    // Generate the optimized set of weights for the mode
835
49.1k
    compute_quantized_weights_for_decimation(
836
49.1k
        di,
837
49.1k
        weight_low_value1[i],
838
49.1k
        weight_high_value1[i],
839
49.1k
        dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
840
49.1k
        dec_weights_uquantf,
841
49.1k
        dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
842
49.1k
        bm.get_weight_quant_mode());
843
844
49.1k
    compute_quantized_weights_for_decimation(
845
49.1k
        di,
846
49.1k
        weight_low_value2[i],
847
49.1k
        weight_high_value2[i],
848
49.1k
        dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
849
49.1k
        dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
850
49.1k
        dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
851
49.1k
        bm.get_weight_quant_mode());
852
853
    // Compute weight quantization errors for the block mode
854
49.1k
    qwt_errors[i] = compute_error_of_weight_set_2planes(
855
49.1k
        ei1,
856
49.1k
        ei2,
857
49.1k
        di,
858
49.1k
        dec_weights_uquantf,
859
49.1k
        dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
860
49.1k
  }
861
862
  // Decide the optimal combination of color endpoint encodings and weight encodings
863
6.99k
  uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
864
6.99k
  int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
865
866
6.99k
  quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
867
6.99k
  quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
868
869
6.99k
  endpoints epm;
870
6.99k
  merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
871
872
6.99k
  const auto& pi = bsd.get_partition_info(1, 0);
873
6.99k
  unsigned int candidate_count = compute_ideal_endpoint_formats(
874
6.99k
      pi, blk, epm, qwt_bitcounts, qwt_errors,
875
6.99k
      config.tune_candidate_limit,
876
6.99k
    bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
877
6.99k
      partition_format_specifiers, block_mode_index,
878
6.99k
      color_quant_level, color_quant_level_mod, tmpbuf);
879
880
  // Iterate over the N believed-to-be-best modes to find out which one is actually best
881
6.99k
  float best_errorval_in_mode = ERROR_CALC_DEFAULT;
882
6.99k
  float best_errorval_in_scb = scb.errorval;
883
884
24.5k
  for (unsigned int i = 0; i < candidate_count; i++)
885
17.5k
  {
886
17.5k
    TRACE_NODE(node0, "candidate");
887
888
17.5k
    const int bm_packed_index = block_mode_index[i];
889
17.5k
    assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
890
17.5k
           bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
891
17.5k
    const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
892
893
17.5k
    int decimation_mode = qw_bm.decimation_mode;
894
17.5k
    const auto& di = bsd.get_decimation_info(decimation_mode);
895
17.5k
    promise(di.weight_count > 0);
896
897
17.5k
    trace_add_data("weight_x", di.weight_x);
898
17.5k
    trace_add_data("weight_y", di.weight_y);
899
17.5k
    trace_add_data("weight_z", di.weight_z);
900
17.5k
    trace_add_data("weight_quant", qw_bm.quant_mode);
901
902
17.5k
    vfloat4 rgbs_color;
903
17.5k
    vfloat4 rgbo_color;
904
905
17.5k
    symbolic_compressed_block workscb;
906
17.5k
    endpoints workep = epm;
907
908
17.5k
    uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
909
17.5k
    uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
910
911
308k
    for (int j = 0; j < di.weight_count; j++)
912
290k
    {
913
290k
      workscb.weights[j] = u8_weight1_src[j];
914
290k
      workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
915
290k
    }
916
917
27.0k
    for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
918
25.6k
    {
919
25.6k
      recompute_ideal_colors_2planes(
920
25.6k
          blk, bsd, di,
921
25.6k
          workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
922
25.6k
          workep, rgbs_color, rgbo_color, plane2_component);
923
924
      // Quantize the chosen color
925
25.6k
      workscb.color_formats[0] = pack_color_endpoints(
926
25.6k
                                     workep.endpt0[0],
927
25.6k
                                     workep.endpt1[0],
928
25.6k
                                     rgbs_color, rgbo_color,
929
25.6k
                                     partition_format_specifiers[i][0],
930
25.6k
                                     workscb.color_values[0],
931
25.6k
                                     color_quant_level[i]);
932
933
      // Store header fields
934
25.6k
      workscb.partition_count = 1;
935
25.6k
      workscb.partition_index = 0;
936
25.6k
      workscb.quant_mode = color_quant_level[i];
937
25.6k
      workscb.color_formats_matched = 0;
938
25.6k
      workscb.block_mode = qw_bm.mode_index;
939
25.6k
      workscb.plane2_component = static_cast<int8_t>(plane2_component);
940
25.6k
      workscb.block_type = SYM_BTYPE_NONCONST;
941
942
      // Pre-realign test
943
25.6k
      if (l == 0)
944
17.5k
      {
945
17.5k
        float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
946
17.5k
        if (errorval == -ERROR_CALC_DEFAULT)
947
5.22k
        {
948
5.22k
          errorval = -errorval;
949
5.22k
          workscb.block_type = SYM_BTYPE_ERROR;
950
5.22k
        }
951
952
17.5k
        trace_add_data("error_prerealign", errorval);
953
17.5k
        best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
954
955
        // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
956
        // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
957
        // drive a heuristic to skip blocks that are unlikely to catch up with the best
958
        // block we have already.
959
17.5k
        unsigned int iters_remaining = config.tune_refinement_limit - l;
960
17.5k
        float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
961
17.5k
        if (errorval > (threshold * best_errorval_in_scb))
962
9.36k
        {
963
9.36k
          break;
964
9.36k
        }
965
966
8.22k
        if (errorval < best_errorval_in_scb)
967
1.73k
        {
968
1.73k
          best_errorval_in_scb = errorval;
969
1.73k
          workscb.errorval = errorval;
970
1.73k
          scb = workscb;
971
972
1.73k
          if (errorval < tune_errorval_threshold)
973
3
          {
974
            // Skip remaining candidates - this is "good enough"
975
3
            i = candidate_count;
976
3
            break;
977
3
          }
978
1.73k
        }
979
8.22k
      }
980
981
      // Perform a final pass over the weights to try to improve them.
982
16.3k
      bool adjustments;
983
16.3k
      if (di.weight_count != bsd.texel_count)
984
7.62k
      {
985
7.62k
        adjustments = realign_weights_decimated(
986
7.62k
          config.profile, bsd, blk, workscb);
987
7.62k
      }
988
8.68k
      else
989
8.68k
      {
990
8.68k
        adjustments = realign_weights_undecimated(
991
8.68k
          config.profile, bsd, blk, workscb);
992
8.68k
      }
993
994
      // Post-realign test
995
16.3k
      float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
996
16.3k
      if (errorval == -ERROR_CALC_DEFAULT)
997
5.38k
      {
998
5.38k
        errorval = -errorval;
999
5.38k
        workscb.block_type = SYM_BTYPE_ERROR;
1000
5.38k
      }
1001
1002
16.3k
      trace_add_data("error_postrealign", errorval);
1003
16.3k
      best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1004
1005
      // Average refinement improvement is 3.5% per iteration, so skip blocks that are
1006
      // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1007
      // give benefit of the doubt ...
1008
16.3k
      unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1009
16.3k
      float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1010
16.3k
      if (errorval > (threshold * best_errorval_in_scb))
1011
1.63k
      {
1012
1.63k
        break;
1013
1.63k
      }
1014
1015
14.6k
      if (errorval < best_errorval_in_scb)
1016
2.37k
      {
1017
2.37k
        best_errorval_in_scb = errorval;
1018
2.37k
        workscb.errorval = errorval;
1019
2.37k
        scb = workscb;
1020
1021
2.37k
        if (errorval < tune_errorval_threshold)
1022
5
        {
1023
          // Skip remaining candidates - this is "good enough"
1024
5
          i = candidate_count;
1025
5
          break;
1026
5
        }
1027
2.37k
      }
1028
1029
14.6k
      if (!adjustments)
1030
5.22k
      {
1031
5.22k
        break;
1032
5.22k
      }
1033
14.6k
    }
1034
17.5k
  }
1035
1036
6.99k
  return best_errorval_in_mode;
1037
6.99k
}
1038
1039
/**
1040
 * @brief Determine the lowest cross-channel correlation factor.
1041
 *
1042
 * @param texels_per_block   The number of texels in a block.
1043
 * @param blk                The image block color data to compress.
1044
 *
1045
 * @return Return the lowest correlation factor.
1046
 */
1047
static float prepare_block_statistics(
1048
  int texels_per_block,
1049
  const image_block& blk
1050
2.17k
) {
1051
  // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1052
  // of the matrix. The matrix is symmetric, so this is all we need for this use case.
1053
2.17k
  float rs = 0.0f;
1054
2.17k
  float gs = 0.0f;
1055
2.17k
  float bs = 0.0f;
1056
2.17k
  float as = 0.0f;
1057
2.17k
  float rr_var = 0.0f;
1058
2.17k
  float gg_var = 0.0f;
1059
2.17k
  float bb_var = 0.0f;
1060
2.17k
  float aa_var = 0.0f;
1061
2.17k
  float rg_cov = 0.0f;
1062
2.17k
  float rb_cov = 0.0f;
1063
2.17k
  float ra_cov = 0.0f;
1064
2.17k
  float gb_cov = 0.0f;
1065
2.17k
  float ga_cov = 0.0f;
1066
2.17k
  float ba_cov = 0.0f;
1067
1068
2.17k
  float weight_sum = 0.0f;
1069
1070
2.17k
  promise(texels_per_block > 0);
1071
58.0k
  for (int i = 0; i < texels_per_block; i++)
1072
55.8k
  {
1073
55.8k
    float weight = hadd_s(blk.channel_weight) / 4.0f;
1074
55.8k
    assert(weight >= 0.0f);
1075
55.8k
    weight_sum += weight;
1076
1077
55.8k
    float r = blk.data_r[i];
1078
55.8k
    float g = blk.data_g[i];
1079
55.8k
    float b = blk.data_b[i];
1080
55.8k
    float a = blk.data_a[i];
1081
1082
55.8k
    float rw = r * weight;
1083
55.8k
    rs += rw;
1084
55.8k
    rr_var += r * rw;
1085
55.8k
    rg_cov += g * rw;
1086
55.8k
    rb_cov += b * rw;
1087
55.8k
    ra_cov += a * rw;
1088
1089
55.8k
    float gw = g * weight;
1090
55.8k
    gs += gw;
1091
55.8k
    gg_var += g * gw;
1092
55.8k
    gb_cov += b * gw;
1093
55.8k
    ga_cov += a * gw;
1094
1095
55.8k
    float bw = b * weight;
1096
55.8k
    bs += bw;
1097
55.8k
    bb_var += b * bw;
1098
55.8k
    ba_cov += a * bw;
1099
1100
55.8k
    float aw = a * weight;
1101
55.8k
    as += aw;
1102
55.8k
    aa_var += a * aw;
1103
55.8k
  }
1104
1105
2.17k
  float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1106
1107
2.17k
  rr_var -= rs * (rs * rpt);
1108
2.17k
  rg_cov -= gs * (rs * rpt);
1109
2.17k
  rb_cov -= bs * (rs * rpt);
1110
2.17k
  ra_cov -= as * (rs * rpt);
1111
1112
2.17k
  gg_var -= gs * (gs * rpt);
1113
2.17k
  gb_cov -= bs * (gs * rpt);
1114
2.17k
  ga_cov -= as * (gs * rpt);
1115
1116
2.17k
  bb_var -= bs * (bs * rpt);
1117
2.17k
  ba_cov -= as * (bs * rpt);
1118
1119
2.17k
  aa_var -= as * (as * rpt);
1120
1121
  // These will give a NaN if a channel is constant - these are fixed up in the next step
1122
2.17k
  rg_cov *= astc::rsqrt(rr_var * gg_var);
1123
2.17k
  rb_cov *= astc::rsqrt(rr_var * bb_var);
1124
2.17k
  ra_cov *= astc::rsqrt(rr_var * aa_var);
1125
2.17k
  gb_cov *= astc::rsqrt(gg_var * bb_var);
1126
2.17k
  ga_cov *= astc::rsqrt(gg_var * aa_var);
1127
2.17k
  ba_cov *= astc::rsqrt(bb_var * aa_var);
1128
1129
2.17k
  if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1130
2.17k
  if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1131
2.17k
  if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1132
2.17k
  if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1133
2.17k
  if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1134
2.17k
  if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1135
1136
2.17k
  float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1137
2.17k
  lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1138
2.17k
  lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1139
2.17k
  lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1140
2.17k
  lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1141
1142
  // Diagnostic trace points
1143
2.17k
  trace_add_data("min_r", blk.data_min.lane<0>());
1144
2.17k
  trace_add_data("max_r", blk.data_max.lane<0>());
1145
2.17k
  trace_add_data("min_g", blk.data_min.lane<1>());
1146
2.17k
  trace_add_data("max_g", blk.data_max.lane<1>());
1147
2.17k
  trace_add_data("min_b", blk.data_min.lane<2>());
1148
2.17k
  trace_add_data("max_b", blk.data_max.lane<2>());
1149
2.17k
  trace_add_data("min_a", blk.data_min.lane<3>());
1150
2.17k
  trace_add_data("max_a", blk.data_max.lane<3>());
1151
2.17k
  trace_add_data("cov_rg", fabsf(rg_cov));
1152
2.17k
  trace_add_data("cov_rb", fabsf(rb_cov));
1153
2.17k
  trace_add_data("cov_ra", fabsf(ra_cov));
1154
2.17k
  trace_add_data("cov_gb", fabsf(gb_cov));
1155
2.17k
  trace_add_data("cov_ga", fabsf(ga_cov));
1156
2.17k
  trace_add_data("cov_ba", fabsf(ba_cov));
1157
1158
2.17k
  return lowest_correlation;
1159
2.17k
}
1160
1161
/* See header for documentation. */
1162
void compress_block(
1163
  const astcenc_contexti& ctx,
1164
  const image_block& blk,
1165
  uint8_t pcb[16],
1166
  compression_working_buffers& tmpbuf)
1167
2.21k
{
1168
2.21k
  astcenc_profile decode_mode = ctx.config.profile;
1169
2.21k
  symbolic_compressed_block scb;
1170
2.21k
  const block_size_descriptor& bsd = *ctx.bsd;
1171
2.21k
  float lowest_correl;
1172
1173
2.21k
  TRACE_NODE(node0, "block");
1174
2.21k
  trace_add_data("pos_x", blk.xpos);
1175
2.21k
  trace_add_data("pos_y", blk.ypos);
1176
2.21k
  trace_add_data("pos_z", blk.zpos);
1177
1178
  // Set stricter block targets for luminance data as we have more bits to play with
1179
2.21k
  bool block_is_l = blk.is_luminance();
1180
2.21k
  float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1181
1182
  // Set slightly stricter block targets for lumalpha data as we have more bits to play with
1183
2.21k
  bool block_is_la = blk.is_luminancealpha();
1184
2.21k
  float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1185
1186
2.21k
  bool block_skip_two_plane = false;
1187
2.21k
  int max_partitions = ctx.config.tune_partition_count_limit;
1188
1189
2.21k
  unsigned int requested_partition_indices[3] {
1190
2.21k
    ctx.config.tune_2partition_index_limit,
1191
2.21k
    ctx.config.tune_3partition_index_limit,
1192
2.21k
    ctx.config.tune_4partition_index_limit
1193
2.21k
  };
1194
1195
2.21k
  unsigned int requested_partition_trials[3] {
1196
2.21k
    ctx.config.tune_2partitioning_candidate_limit,
1197
2.21k
    ctx.config.tune_3partitioning_candidate_limit,
1198
2.21k
    ctx.config.tune_4partitioning_candidate_limit
1199
2.21k
  };
1200
1201
#if defined(ASTCENC_DIAGNOSTICS)
1202
  // Do this early in diagnostic builds so we can dump uniform metrics
1203
  // for every block. Do it later in release builds to avoid redundant work!
1204
  float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1205
  float error_threshold = ctx.config.tune_db_limit
1206
                        * error_weight_sum
1207
                        * block_is_l_scale
1208
                        * block_is_la_scale;
1209
1210
  lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1211
  trace_add_data("lowest_correl", lowest_correl);
1212
  trace_add_data("tune_error_threshold", error_threshold);
1213
#endif
1214
1215
  // Detected a constant-color block
1216
2.21k
  if (all(blk.data_min == blk.data_max))
1217
11
  {
1218
11
    TRACE_NODE(node1, "pass");
1219
11
    trace_add_data("partition_count", 0);
1220
11
    trace_add_data("plane_count", 1);
1221
1222
11
    scb.partition_count = 0;
1223
1224
    // Encode as FP16 if using HDR
1225
11
    if ((decode_mode == ASTCENC_PRF_HDR) ||
1226
8
        (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1227
5
    {
1228
5
      scb.block_type = SYM_BTYPE_CONST_F16;
1229
5
      vint4 color_f16 = float_to_float16(blk.origin_texel);
1230
5
      store(color_f16, scb.constant_color);
1231
5
    }
1232
    // Encode as UNORM16 if NOT using HDR
1233
6
    else
1234
6
    {
1235
6
      scb.block_type = SYM_BTYPE_CONST_U16;
1236
6
      vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1237
6
      vint4 color_u16 = float_to_int_rtn(color_f32);
1238
6
      store(color_u16, scb.constant_color);
1239
6
    }
1240
1241
11
    trace_add_data("exit", "quality hit");
1242
1243
11
    symbolic_to_physical(bsd, scb, pcb);
1244
11
    return;
1245
11
  }
1246
1247
2.20k
#if !defined(ASTCENC_DIAGNOSTICS)
1248
2.20k
  float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1249
2.20k
  float error_threshold = ctx.config.tune_db_limit
1250
2.20k
                        * error_weight_sum
1251
2.20k
                        * block_is_l_scale
1252
2.20k
                        * block_is_la_scale;
1253
2.20k
#endif
1254
1255
  // Set SCB and mode errors to a very high error value
1256
2.20k
  scb.errorval = ERROR_CALC_DEFAULT;
1257
2.20k
  scb.block_type = SYM_BTYPE_ERROR;
1258
1259
2.20k
  float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1260
2.20k
    ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1261
2.20k
  };
1262
1263
2.20k
  float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1264
2.20k
    0.0f,
1265
2.20k
    ctx.config.tune_2partition_early_out_limit_factor,
1266
2.20k
    ctx.config.tune_3partition_early_out_limit_factor,
1267
2.20k
    0.0f
1268
2.20k
  };
1269
1270
  // Trial using 1 plane of weights and 1 partition.
1271
1272
  // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1273
  // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1274
  // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1275
  // compression and slightly reduces image quality.
1276
1277
2.20k
  float errorval_mult[2] {
1278
2.20k
    1.0f / ctx.config.tune_mse_overshoot,
1279
2.20k
    1.0f
1280
2.20k
  };
1281
1282
2.20k
  const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1283
1284
  // Only enable MODE0 fast path if enabled
1285
  // Never enable for 3D blocks as no "always" block modes are available
1286
2.20k
  int start_trial = 1;
1287
2.20k
  if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1288
347
  {
1289
347
    start_trial = 0;
1290
347
  }
1291
1292
2.20k
  int quant_limit = QUANT_32;
1293
4.71k
  for (int i = start_trial; i < 2; i++)
1294
2.54k
  {
1295
2.54k
    TRACE_NODE(node1, "pass");
1296
2.54k
    trace_add_data("partition_count", 1);
1297
2.54k
    trace_add_data("plane_count", 1);
1298
2.54k
    trace_add_data("search_mode", i);
1299
1300
2.54k
    float errorval = compress_symbolic_block_for_partition_1plane(
1301
2.54k
        ctx.config, bsd, blk, i == 0,
1302
2.54k
        error_threshold * errorval_mult[i] * errorval_overshoot,
1303
2.54k
        1, 0,  scb, tmpbuf, QUANT_32);
1304
1305
    // Record the quant level so we can use the filter later searches
1306
2.54k
    if (scb.block_type != SYM_BTYPE_ERROR)
1307
1.94k
    {
1308
1.94k
      const auto& bm = bsd.get_block_mode(scb.block_mode);
1309
1.94k
      quant_limit = bm.get_weight_quant_mode();
1310
1.94k
    }
1311
1312
2.54k
    best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1313
2.54k
    if (errorval < (error_threshold * errorval_mult[i]))
1314
33
    {
1315
33
      trace_add_data("exit", "quality hit");
1316
33
      goto END_OF_TESTS;
1317
33
    }
1318
2.54k
  }
1319
1320
2.17k
#if !defined(ASTCENC_DIAGNOSTICS)
1321
2.17k
  lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1322
2.17k
#endif
1323
1324
2.17k
  block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1325
1326
  // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1327
  // alpha is the most likely to be non-correlated if it is present in the data.
1328
9.38k
  for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1329
7.64k
  {
1330
7.64k
    TRACE_NODE(node1, "pass");
1331
7.64k
    trace_add_data("partition_count", 1);
1332
7.64k
    trace_add_data("plane_count", 2);
1333
7.64k
    trace_add_data("plane_component", i);
1334
1335
7.64k
    if (block_skip_two_plane)
1336
224
    {
1337
224
      trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1338
224
      continue;
1339
224
    }
1340
1341
7.42k
    if (blk.grayscale && i != 3)
1342
9
    {
1343
9
      trace_add_data("skip", "grayscale block");
1344
9
      continue;
1345
9
    }
1346
1347
7.41k
    if (blk.is_constant_channel(i))
1348
419
    {
1349
419
      trace_add_data("skip", "constant component");
1350
419
      continue;
1351
419
    }
1352
1353
6.99k
    float errorval = compress_symbolic_block_for_partition_2planes(
1354
6.99k
        ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1355
6.99k
        i, scb, tmpbuf, quant_limit);
1356
1357
    // If attempting two planes is much worse than the best one plane result
1358
    // then further two plane searches are unlikely to help so move on ...
1359
6.99k
    if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1360
421
    {
1361
421
      break;
1362
421
    }
1363
1364
6.57k
    if (errorval < error_threshold)
1365
13
    {
1366
13
      trace_add_data("exit", "quality hit");
1367
13
      goto END_OF_TESTS;
1368
13
    }
1369
6.57k
  }
1370
1371
  // Find best blocks for 2, 3 and 4 partitions
1372
5.34k
  for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1373
4.94k
  {
1374
4.94k
    unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1375
1376
4.94k
    unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1377
1378
4.94k
    unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1379
4.94k
    requested_trials = astc::min(requested_trials, requested_indices);
1380
1381
4.94k
    unsigned int actual_trials = find_best_partition_candidates(
1382
4.94k
        bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1383
1384
4.94k
    float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1385
1386
11.8k
    for (unsigned int i = 0; i < actual_trials; i++)
1387
8.37k
    {
1388
8.37k
      TRACE_NODE(node1, "pass");
1389
8.37k
      trace_add_data("partition_count", partition_count);
1390
8.37k
      trace_add_data("partition_index", partition_indices[i]);
1391
8.37k
      trace_add_data("plane_count", 1);
1392
8.37k
      trace_add_data("search_mode", i);
1393
1394
8.37k
      float errorval = compress_symbolic_block_for_partition_1plane(
1395
8.37k
          ctx.config, bsd, blk, false,
1396
8.37k
          error_threshold * errorval_overshoot,
1397
8.37k
          partition_count, partition_indices[i],
1398
8.37k
          scb, tmpbuf, quant_limit);
1399
1400
8.37k
      best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1401
1402
      // If using N partitions doesn't improve much over using N-1 partitions then skip trying
1403
      // N+1. Error can dramatically improve if the data is correlated or non-correlated and
1404
      // aligns with a partitioning that suits that encoding, so for this inner loop check add
1405
      // a large error scale because the "other" trial could be a lot better.
1406
8.37k
      float best_error = best_errorvals_for_pcount[partition_count - 1];
1407
8.37k
      float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1408
8.37k
      if (best_error > (best_error_in_prev * best_error_scale))
1409
1.48k
      {
1410
1.48k
        trace_add_data("skip", "tune_partition_early_out_limit_factor");
1411
1.48k
        goto END_OF_TESTS;
1412
1.48k
      }
1413
1414
6.89k
      if (errorval < error_threshold)
1415
23
      {
1416
23
        trace_add_data("exit", "quality hit");
1417
23
        goto END_OF_TESTS;
1418
23
      }
1419
6.89k
    }
1420
1421
    // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1422
3.43k
    float best_error = best_errorvals_for_pcount[partition_count - 1];
1423
3.43k
    float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1424
3.43k
    if (best_error > (best_error_in_prev * best_error_scale))
1425
244
    {
1426
244
      trace_add_data("skip", "tune_partition_early_out_limit_factor");
1427
244
      goto END_OF_TESTS;
1428
244
    }
1429
3.43k
  }
1430
1431
406
  trace_add_data("exit", "quality not hit");
1432
1433
2.20k
END_OF_TESTS:
1434
  // If we still have an error block then convert to something we can encode
1435
  // TODO: Do something more sensible here, such as average color block
1436
2.20k
  if (scb.block_type == SYM_BTYPE_ERROR)
1437
120
  {
1438
#if defined(ASTCENC_DIAGNOSTICS)
1439
    static bool printed_once = false;
1440
    if (!printed_once)
1441
    {
1442
      printed_once = true;
1443
      printf("WARN: At least one block failed to find a valid encoding.\n"
1444
             "      Try increasing compression quality settings.\n\n");
1445
    }
1446
#endif
1447
1448
120
    scb.block_type = SYM_BTYPE_CONST_U16;
1449
120
    vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1450
120
    vint4 color_u16 = float_to_int_rtn(color_f32);
1451
120
    store(color_u16, scb.constant_color);
1452
120
  }
1453
1454
  // Compress to a physical block
1455
2.20k
  symbolic_to_physical(bsd, scb, pcb);
1456
2.20k
}
1457
1458
#endif