Coverage Report

Created: 2026-06-04 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/astc-encoder/Source/astcenc_find_best_partitioning.cpp
Line
Count
Source
1
// SPDX-License-Identifier: Apache-2.0
2
// ----------------------------------------------------------------------------
3
// Copyright 2011-2026 Arm Limited
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6
// use this file except in compliance with the License. You may obtain a copy
7
// of the License at:
8
//
9
//     http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14
// License for the specific language governing permissions and limitations
15
// under the License.
16
// ----------------------------------------------------------------------------
17
18
#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20
/**
21
 * @brief Functions for finding best partition for a block.
22
 *
23
 * The partition search operates in two stages. The first pass uses kmeans clustering to group
24
 * texels into an ideal partitioning for the requested partition count, and then compares that
25
 * against the 1024 partitionings generated by the ASTC partition hash function. The generated
26
 * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
27
 * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
28
 * partitionings that actually generate fewer than the requested partition count, but only the top
29
 * N candidates are actually put through a more detailed search. N is determined by the compressor
30
 * quality preset.
31
 *
32
 * For the detailed search, each candidate is checked against two possible encoding methods:
33
 *
34
 *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
35
 *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
36
 *
37
 * This is implemented by computing the mean color and dominant direction for each
38
 * partition. This defines two lines, both of which go through the mean color value.
39
 *
40
 * - One line has a direction defined by the dominant direction; this is used to assess the error
41
 *   from using an uncorrelated color representation.
42
 * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
43
 *   (RGB + scale) color representation.
44
 *
45
 * The best candidate is selected by computing the squared-errors that result from using these
46
 * lines for endpoint selection.
47
 */
48
49
#include <limits>
50
#include "astcenc_internal.h"
51
52
/**
53
 * @brief Pick some initial kmeans cluster centers.
54
 *
55
 * @param      blk               The image block color data to compress.
56
 * @param      texel_count       The number of texels in the block.
57
 * @param      partition_count   The number of partitions in the block.
58
 * @param[out] cluster_centers   The initial partition cluster center colors.
59
 */
60
static void kmeans_init(
61
  const image_block& blk,
62
  unsigned int texel_count,
63
  unsigned int partition_count,
64
  vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
65
5.01k
) {
66
5.01k
  promise(texel_count > 0);
67
5.01k
  promise(partition_count > 0);
68
69
5.01k
  unsigned int clusters_selected = 0;
70
5.01k
  float distances[BLOCK_MAX_TEXELS];
71
72
  // Pick a random sample as first cluster center; 145897 from random.org
73
5.01k
  unsigned int sample = 145897 % texel_count;
74
5.01k
  vfloat4 center_color = blk.texel(sample);
75
5.01k
  cluster_centers[clusters_selected] = center_color;
76
5.01k
  clusters_selected++;
77
78
  // Compute the distance to the first cluster center
79
5.01k
  float distance_sum = 0.0f;
80
132k
  for (unsigned int i = 0; i < texel_count; i++)
81
127k
  {
82
127k
    vfloat4 color = blk.texel(i);
83
127k
    vfloat4 diff = color - center_color;
84
127k
    float distance = dot_s(diff * diff, blk.channel_weight);
85
127k
    distance_sum += distance;
86
127k
    distances[i] = distance;
87
127k
  }
88
89
  // More numbers from random.org for weighted-random center selection
90
5.01k
  const float cluster_cutoffs[9] {
91
5.01k
    0.626220f, 0.932770f, 0.275454f,
92
5.01k
    0.318558f, 0.240113f, 0.009190f,
93
5.01k
    0.347661f, 0.731960f, 0.156391f
94
5.01k
  };
95
96
5.01k
  unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
97
98
  // Pick the remaining samples as needed
99
8.85k
  while (true)
100
8.85k
  {
101
    // Pick the next center in a weighted-random fashion.
102
8.85k
    float summa = 0.0f;
103
8.85k
    float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
104
95.9k
    for (sample = 0; sample < texel_count; sample++)
105
95.9k
    {
106
95.9k
      summa += distances[sample];
107
95.9k
      if (summa >= distance_cutoff)
108
8.85k
      {
109
8.85k
        break;
110
8.85k
      }
111
95.9k
    }
112
113
    // Clamp to a valid range and store the selected cluster center
114
8.85k
    sample = astc::min(sample, texel_count - 1);
115
116
8.85k
    center_color = blk.texel(sample);
117
8.85k
    cluster_centers[clusters_selected++] = center_color;
118
8.85k
    if (clusters_selected >= partition_count)
119
5.01k
    {
120
5.01k
      break;
121
5.01k
    }
122
123
    // Compute the distance to the new cluster center, keep the min dist
124
3.84k
    distance_sum = 0.0f;
125
100k
    for (unsigned int i = 0; i < texel_count; i++)
126
96.4k
    {
127
96.4k
      vfloat4 color = blk.texel(i);
128
96.4k
      vfloat4 diff = color - center_color;
129
96.4k
      float distance = dot_s(diff * diff, blk.channel_weight);
130
96.4k
      distance = astc::min(distance, distances[i]);
131
96.4k
      distance_sum += distance;
132
96.4k
      distances[i] = distance;
133
96.4k
    }
134
3.84k
  }
135
5.01k
}
136
137
/**
138
 * @brief Assign texels to clusters, based on a set of chosen center points.
139
 *
140
 * @param      blk                  The image block color data to compress.
141
 * @param      texel_count          The number of texels in the block.
142
 * @param      partition_count      The number of partitions in the block.
143
 * @param      cluster_centers      The partition cluster center colors.
144
 * @param[out] partition_of_texel   The partition assigned for each texel.
145
 */
146
static void kmeans_assign(
147
  const image_block& blk,
148
  unsigned int texel_count,
149
  unsigned int partition_count,
150
  const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
151
  uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
152
15.0k
) {
153
15.0k
  promise(texel_count > 0);
154
15.0k
  promise(partition_count > 0);
155
156
15.0k
  uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
157
158
  // Find the best partition for every texel
159
398k
  for (unsigned int i = 0; i < texel_count; i++)
160
383k
  {
161
383k
    float best_distance = std::numeric_limits<float>::max();
162
383k
    unsigned int best_partition = 0;
163
164
383k
    vfloat4 color = blk.texel(i);
165
1.44M
    for (unsigned int j = 0; j < partition_count; j++)
166
1.05M
    {
167
1.05M
      vfloat4 diff = color - cluster_centers[j];
168
1.05M
      float distance = dot_s(diff * diff, blk.channel_weight);
169
1.05M
      if (distance < best_distance)
170
556k
      {
171
556k
        best_distance = distance;
172
556k
        best_partition = j;
173
556k
      }
174
1.05M
    }
175
176
383k
    partition_of_texel[i] = static_cast<uint8_t>(best_partition);
177
383k
    partition_texel_count[best_partition]++;
178
383k
  }
179
180
  // It is possible to get a situation where a partition ends up without any texels. In this case,
181
  // assign texel N to partition N. This is silly, but ensures that every partition retains at
182
  // least one texel. Reassigning a texel in this manner may cause another partition to go empty,
183
  // so if we actually did a reassignment, run the whole loop over again.
184
15.0k
  bool problem_case;
185
15.0k
  do
186
15.5k
  {
187
15.5k
    problem_case = false;
188
58.5k
    for (unsigned int i = 0; i < partition_count; i++)
189
43.0k
    {
190
43.0k
      if (partition_texel_count[i] == 0)
191
845
      {
192
845
        partition_texel_count[partition_of_texel[i]]--;
193
845
        partition_texel_count[i]++;
194
845
        partition_of_texel[i] = static_cast<uint8_t>(i);
195
845
        problem_case = true;
196
845
      }
197
43.0k
    }
198
15.5k
  } while (problem_case);
199
15.0k
}
200
201
/**
202
 * @brief Compute new cluster centers based on their center of gravity.
203
 *
204
 * @param       blk                  The image block color data to compress.
205
 * @param       texel_count          The number of texels in the block.
206
 * @param       partition_count      The number of partitions in the block.
207
 * @param[out]  cluster_centers      The new cluster center colors.
208
 * @param       partition_of_texel   The partition assigned for each texel.
209
 */
210
static void kmeans_update(
211
  const image_block& blk,
212
  unsigned int texel_count,
213
  unsigned int partition_count,
214
  vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
215
  const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
216
10.0k
) {
217
10.0k
  promise(texel_count > 0);
218
10.0k
  promise(partition_count > 0);
219
220
10.0k
  vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
221
10.0k
    vfloat4::zero(),
222
10.0k
    vfloat4::zero(),
223
10.0k
    vfloat4::zero(),
224
10.0k
    vfloat4::zero()
225
10.0k
  };
226
227
10.0k
  uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
228
229
  // Find the center of gravity in each cluster
230
265k
  for (unsigned int i = 0; i < texel_count; i++)
231
255k
  {
232
255k
    uint8_t partition = partition_of_texel[i];
233
255k
    color_sum[partition] += blk.texel(i);
234
255k
    partition_texel_count[partition]++;
235
255k
  }
236
237
  // Set the center of gravity to be the new cluster center
238
37.7k
  for (unsigned int i = 0; i < partition_count; i++)
239
27.7k
  {
240
27.7k
    float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
241
27.7k
    cluster_centers[i] = color_sum[i] * scale;
242
27.7k
  }
243
10.0k
}
244
245
/**
246
 * @brief Compute bit-mismatch for partitioning in 2-partition mode.
247
 *
248
 * @param a   The texel assignment bitvector for the block.
249
 * @param b   The texel assignment bitvector for the partition table.
250
 *
251
 * @return    The number of bit mismatches.
252
 */
253
static inline uint8_t partition_mismatch2(
254
  const uint64_t a[2],
255
  const uint64_t b[2]
256
1.12M
) {
257
1.12M
  int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
258
1.12M
  int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
259
260
  // Divide by 2 because XOR always counts errors twice, once when missing
261
  // in the expected position, and again when present in the wrong partition
262
1.12M
  return static_cast<uint8_t>(astc::min(v1, v2) / 2);
263
1.12M
}
264
265
/**
266
 * @brief Compute bit-mismatch for partitioning in 3-partition mode.
267
 *
268
 * @param a   The texel assignment bitvector for the block.
269
 * @param b   The texel assignment bitvector for the partition table.
270
 *
271
 * @return    The number of bit mismatches.
272
 */
273
static inline uint8_t partition_mismatch3(
274
  const uint64_t a[3],
275
  const uint64_t b[3]
276
690k
) {
277
690k
  int p00 = popcount(a[0] ^ b[0]);
278
690k
  int p01 = popcount(a[0] ^ b[1]);
279
690k
  int p02 = popcount(a[0] ^ b[2]);
280
281
690k
  int p10 = popcount(a[1] ^ b[0]);
282
690k
  int p11 = popcount(a[1] ^ b[1]);
283
690k
  int p12 = popcount(a[1] ^ b[2]);
284
285
690k
  int p20 = popcount(a[2] ^ b[0]);
286
690k
  int p21 = popcount(a[2] ^ b[1]);
287
690k
  int p22 = popcount(a[2] ^ b[2]);
288
289
690k
  int s0 = p11 + p22;
290
690k
  int s1 = p12 + p21;
291
690k
  int v0 = astc::min(s0, s1) + p00;
292
293
690k
  int s2 = p10 + p22;
294
690k
  int s3 = p12 + p20;
295
690k
  int v1 = astc::min(s2, s3) + p01;
296
297
690k
  int s4 = p10 + p21;
298
690k
  int s5 = p11 + p20;
299
690k
  int v2 = astc::min(s4, s5) + p02;
300
301
  // Divide by 2 because XOR always counts errors twice, once when missing
302
  // in the expected position, and again when present in the wrong partition
303
690k
  return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
304
690k
}
305
306
/**
307
 * @brief Compute bit-mismatch for partitioning in 4-partition mode.
308
 *
309
 * @param a   The texel assignment bitvector for the block.
310
 * @param b   The texel assignment bitvector for the partition table.
311
 *
312
 * @return    The number of bit mismatches.
313
 */
314
static inline uint8_t partition_mismatch4(
315
  const uint64_t a[4],
316
  const uint64_t b[4]
317
379k
) {
318
379k
  int p00 = popcount(a[0] ^ b[0]);
319
379k
  int p01 = popcount(a[0] ^ b[1]);
320
379k
  int p02 = popcount(a[0] ^ b[2]);
321
379k
  int p03 = popcount(a[0] ^ b[3]);
322
323
379k
  int p10 = popcount(a[1] ^ b[0]);
324
379k
  int p11 = popcount(a[1] ^ b[1]);
325
379k
  int p12 = popcount(a[1] ^ b[2]);
326
379k
  int p13 = popcount(a[1] ^ b[3]);
327
328
379k
  int p20 = popcount(a[2] ^ b[0]);
329
379k
  int p21 = popcount(a[2] ^ b[1]);
330
379k
  int p22 = popcount(a[2] ^ b[2]);
331
379k
  int p23 = popcount(a[2] ^ b[3]);
332
333
379k
  int p30 = popcount(a[3] ^ b[0]);
334
379k
  int p31 = popcount(a[3] ^ b[1]);
335
379k
  int p32 = popcount(a[3] ^ b[2]);
336
379k
  int p33 = popcount(a[3] ^ b[3]);
337
338
379k
  int mx23 = astc::min(p22 + p33, p23 + p32);
339
379k
  int mx13 = astc::min(p21 + p33, p23 + p31);
340
379k
  int mx12 = astc::min(p21 + p32, p22 + p31);
341
379k
  int mx03 = astc::min(p20 + p33, p23 + p30);
342
379k
  int mx02 = astc::min(p20 + p32, p22 + p30);
343
379k
  int mx01 = astc::min(p21 + p30, p20 + p31);
344
345
379k
  int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
346
379k
  int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
347
379k
  int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
348
379k
  int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
349
350
  // Divide by 2 because XOR always counts errors twice, once when missing
351
  // in the expected position, and again when present in the wrong partition
352
379k
  return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
353
379k
}
354
355
/**
 * @brief Function pointer type for a routine that takes two bitvector arrays and returns a count.
 *
 * NOTE(review): no use of this alias is visible in this file, and the partition_mismatch2/3/4
 * helpers return uint8_t rather than unsigned int, so they do not match this signature - confirm
 * whether the alias is still needed.
 */
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
356
357
/**
358
 * @brief Count the partition table mismatches vs the data clustering.
359
 *
360
 * @param      bsd               The block size information.
361
 * @param      partition_count   The number of partitions in the block.
362
 * @param      bitmaps           The block texel partition assignment patterns.
363
 * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
364
 */
365
static void count_partition_mismatch_bits(
366
  const block_size_descriptor& bsd,
367
  unsigned int partition_count,
368
  const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
369
  uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
370
5.01k
) {
371
5.01k
  unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
372
5.01k
  promise(active_count > 0);
373
374
5.01k
  if (partition_count == 2)
375
2.18k
  {
376
1.12M
    for (unsigned int i = 0; i < active_count; i++)
377
1.12M
    {
378
1.12M
      mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
379
1.12M
      assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
380
1.12M
      assert(mismatch_counts[i] < bsd.texel_count);
381
1.12M
    }
382
2.18k
  }
383
2.82k
  else if (partition_count == 3)
384
1.81k
  {
385
692k
    for (unsigned int i = 0; i < active_count; i++)
386
690k
    {
387
690k
      mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
388
690k
      assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
389
690k
      assert(mismatch_counts[i] < bsd.texel_count);
390
690k
    }
391
1.81k
  }
392
1.01k
  else
393
1.01k
  {
394
380k
    for (unsigned int i = 0; i < active_count; i++)
395
379k
    {
396
379k
      mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
397
379k
      assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
398
379k
      assert(mismatch_counts[i] < bsd.texel_count);
399
379k
    }
400
1.01k
  }
401
5.01k
}
402
403
/**
404
 * @brief Use counting sort on the mismatch array to sort partition candidates.
405
 *
406
 * @param      partitioning_count   The number of packed partitionings.
407
 * @param      mismatch_count       Partitioning mismatch counts, in index order.
408
 * @param[out] partition_ordering   Partition index values, in mismatch order.
409
 *
410
 * @return The number of active partitions in this selection.
411
 */
412
static unsigned int get_partition_ordering_by_mismatch_bits(
413
  unsigned int texel_count,
414
  unsigned int partitioning_count,
415
  const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
416
  uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
417
5.01k
) {
418
5.01k
  promise(partitioning_count > 0);
419
5.01k
  uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
420
421
  // Create the histogram of mismatch counts
422
2.20M
  for (unsigned int i = 0; i < partitioning_count; i++)
423
2.19M
  {
424
2.19M
    mscount[mismatch_count[i]]++;
425
2.19M
  }
426
427
  // Create a running sum from the histogram array
428
  // Indices store previous values only; i.e. exclude self after sum
429
5.01k
  uint16_t sum = 0;
430
121k
  for (unsigned int i = 0; i < texel_count; i++)
431
116k
  {
432
116k
    uint16_t cnt = mscount[i];
433
116k
    mscount[i] = sum;
434
116k
    sum += cnt;
435
116k
  }
436
437
  // Use the running sum as the index, incrementing after read to allow
438
  // sequential entries with the same count
439
2.20M
  for (unsigned int i = 0; i < partitioning_count; i++)
440
2.19M
  {
441
2.19M
    unsigned int idx = mscount[mismatch_count[i]]++;
442
2.19M
    partition_ordering[idx] = static_cast<uint16_t>(i);
443
2.19M
  }
444
445
5.01k
  return partitioning_count;
446
5.01k
}
447
448
/**
449
 * @brief Use k-means clustering to compute a partition ordering for a block..
450
 *
451
 * @param      bsd                  The block size information.
452
 * @param      blk                  The image block color data to compress.
453
 * @param      partition_count      The desired number of partitions in the block.
454
 * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
455
 *
456
 * @return The number of active partitionings in this selection.
457
 */
458
static unsigned int compute_kmeans_partition_ordering(
459
  const block_size_descriptor& bsd,
460
  const image_block& blk,
461
  unsigned int partition_count,
462
  uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
463
5.01k
) {
464
5.01k
  vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
465
5.01k
  uint8_t texel_partitions[BLOCK_MAX_TEXELS];
466
467
  // Use three passes of k-means clustering to partition the block data
468
20.0k
  for (unsigned int i = 0; i < 3; i++)
469
15.0k
  {
470
15.0k
    if (i == 0)
471
5.01k
    {
472
5.01k
      kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
473
5.01k
    }
474
10.0k
    else
475
10.0k
    {
476
10.0k
      kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
477
10.0k
    }
478
479
15.0k
    kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
480
15.0k
  }
481
482
  // Construct the block bitmaps of texel assignments to each partition
483
5.01k
  uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
484
5.01k
  unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
485
5.01k
  promise(texels_to_process > 0);
486
121k
  for (unsigned int i = 0; i < texels_to_process; i++)
487
116k
  {
488
116k
    unsigned int idx = bsd.kmeans_texels[i];
489
116k
    bitmaps[texel_partitions[idx]] |= 1ULL << i;
490
116k
  }
491
492
  // Count the mismatch between the block and the format's partition tables
493
5.01k
  uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
494
5.01k
  count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
495
496
  // Sort the partitions based on the number of mismatched bits
497
5.01k
  return get_partition_ordering_by_mismatch_bits(
498
5.01k
      texels_to_process,
499
5.01k
      bsd.partitioning_count_selected[partition_count - 1],
500
5.01k
      mismatch_counts, partition_ordering);
501
5.01k
}
502
503
/**
504
 * @brief Insert a partitioning into an order list of results, sorted by error.
505
 *
506
 * @param      max_values      The max number of entries in the best result arrays.
507
 * @param      this_error      The error of the new entry.
508
 * @param      this_partition  The partition ID of the new entry.
509
 * @param[out] best_errors     The array of best error values.
510
 * @param[out] best_partitions The array of best partition values.
511
 */
512
static void insert_result(
513
  unsigned int max_values,
514
  float this_error,
515
  unsigned int this_partition,
516
  float* best_errors,
517
  unsigned int* best_partitions)
518
231k
{
519
231k
  promise(max_values > 0);
520
521
  // Don't bother searching if the current worst error beats the new error
522
231k
  if (this_error >= best_errors[max_values - 1])
523
179k
  {
524
179k
    return;
525
179k
  }
526
527
  // Else insert into the list in error-order
528
75.2k
  for (unsigned int i = 0; i < max_values; i++)
529
75.2k
  {
530
    // Existing result is better - move on ...
531
75.2k
    if (this_error > best_errors[i])
532
22.7k
    {
533
22.7k
      continue;
534
22.7k
    }
535
536
    // Move existing results down one
537
82.2k
    for (unsigned int j = max_values - 1; j > i; j--)
538
29.7k
    {
539
29.7k
      best_errors[j] = best_errors[j - 1];
540
29.7k
      best_partitions[j] = best_partitions[j - 1];
541
29.7k
    }
542
543
    // Insert new result
544
52.5k
    best_errors[i] = this_error;
545
52.5k
    best_partitions[i] = this_partition;
546
52.5k
    break;
547
75.2k
  }
548
52.5k
}
549
550
/* See header for documentation. */
551
unsigned int find_best_partition_candidates(
552
  const block_size_descriptor& bsd,
553
  const image_block& blk,
554
  unsigned int partition_count,
555
  unsigned int partition_search_limit,
556
  unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
557
  unsigned int requested_candidates
558
5.01k
) {
559
  // Constant used to estimate quantization error for a given partitioning; the optimal value for
560
  // this depends on bitrate. These values have been determined empirically.
561
5.01k
  unsigned int texels_per_block = bsd.texel_count;
562
5.01k
  float weight_imprecision_estim = 0.055f;
563
5.01k
  if (texels_per_block <= 20)
564
3.72k
  {
565
3.72k
    weight_imprecision_estim = 0.03f;
566
3.72k
  }
567
1.29k
  else if (texels_per_block <= 31)
568
594
  {
569
594
    weight_imprecision_estim = 0.04f;
570
594
  }
571
698
  else if (texels_per_block <= 41)
572
171
  {
573
171
    weight_imprecision_estim = 0.05f;
574
171
  }
575
576
5.01k
  promise(partition_count > 0);
577
5.01k
  promise(partition_search_limit > 0);
578
579
5.01k
  weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
580
581
5.01k
  uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
582
5.01k
  unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
583
5.01k
  partition_search_limit = astc::min(partition_search_limit, sequence_len);
584
5.01k
  requested_candidates = astc::min(partition_search_limit, requested_candidates);
585
586
5.01k
  bool uses_alpha = !blk.is_constant_channel(3);
587
588
  // Partitioning errors assuming uncorrelated-chrominance endpoints
589
5.01k
  float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
590
5.01k
  unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES] = {};
591
592
  // Partitioning errors assuming same-chrominance endpoints
593
5.01k
  float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
594
5.01k
  unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES] = {};
595
596
15.0k
  for (unsigned int i = 0; i < requested_candidates; i++)
597
10.0k
  {
598
10.0k
    uncor_best_errors[i] = ERROR_CALC_DEFAULT;
599
10.0k
    samec_best_errors[i] = ERROR_CALC_DEFAULT;
600
10.0k
  }
601
602
5.01k
  if (uses_alpha)
603
4.28k
  {
604
103k
    for (unsigned int i = 0; i < partition_search_limit; i++)
605
99.0k
    {
606
99.0k
      unsigned int partition = partition_sequence[i];
607
99.0k
      const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
608
609
      // Compute weighting to give to each component in each partition
610
99.0k
      partition_metrics pms[BLOCK_MAX_PARTITIONS];
611
612
99.0k
      compute_avgs_and_dirs_4_comp(pi, blk, pms);
613
614
99.0k
      line4 uncor_lines[BLOCK_MAX_PARTITIONS];
615
99.0k
      line4 samec_lines[BLOCK_MAX_PARTITIONS];
616
617
99.0k
      processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
618
99.0k
      processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
619
620
99.0k
      float line_lengths[BLOCK_MAX_PARTITIONS];
621
622
357k
      for (unsigned int j = 0; j < partition_count; j++)
623
258k
      {
624
258k
        partition_metrics& pm = pms[j];
625
626
258k
        uncor_lines[j].a = pm.avg;
627
258k
        uncor_lines[j].b = normalize_safe(pm.dir, unit4());
628
629
258k
        uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
630
258k
        uncor_plines[j].bs = uncor_lines[j].b;
631
632
258k
        samec_lines[j].a = vfloat4::zero();
633
258k
        samec_lines[j].b = normalize_safe(pm.avg, unit4());
634
635
258k
        samec_plines[j].amod = vfloat4::zero();
636
258k
        samec_plines[j].bs = samec_lines[j].b;
637
258k
      }
638
639
99.0k
      float uncor_error = 0.0f;
640
99.0k
      float samec_error = 0.0f;
641
642
99.0k
      compute_error_squared_rgba(pi,
643
99.0k
                                 blk,
644
99.0k
                                 uncor_plines,
645
99.0k
                                 samec_plines,
646
99.0k
                                 line_lengths,
647
99.0k
                                 uncor_error,
648
99.0k
                                 samec_error);
649
650
      // Compute an estimate of error introduced by weight quantization imprecision.
651
      // This error is computed as follows, for each partition
652
      //     1: compute the principal-axis vector (full length) in error-space
653
      //     2: convert the principal-axis vector to regular RGB-space
654
      //     3: scale the vector by a constant that estimates average quantization error
655
      //     4: for each texel, square the vector, then do a dot-product with the texel's
656
      //        error weight; sum up the results across all texels.
657
      //     4(optimized): square the vector once, then do a dot-product with the average
658
      //        texel error, then multiply by the number of texels.
659
660
357k
      for (unsigned int j = 0; j < partition_count; j++)
661
258k
      {
662
258k
        float tpp = static_cast<float>(pi.partition_texel_count[j]);
663
258k
        vfloat4 error_weights(tpp * weight_imprecision_estim);
664
665
258k
        vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
666
258k
        vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
667
668
258k
        uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
669
258k
        samec_error += dot_s(samec_vector * samec_vector, error_weights);
670
258k
      }
671
672
99.0k
      insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
673
99.0k
      insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
674
99.0k
    }
675
4.28k
  }
676
729
  else
677
729
  {
678
17.6k
    for (unsigned int i = 0; i < partition_search_limit; i++)
679
16.9k
    {
680
16.9k
      unsigned int partition = partition_sequence[i];
681
16.9k
      const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
682
683
      // Compute weighting to give to each component in each partition
684
16.9k
      partition_metrics pms[BLOCK_MAX_PARTITIONS];
685
16.9k
      compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
686
687
16.9k
      partition_lines3 plines[BLOCK_MAX_PARTITIONS];
688
689
61.2k
      for (unsigned int j = 0; j < partition_count; j++)
690
44.2k
      {
691
44.2k
        partition_metrics& pm = pms[j];
692
44.2k
        partition_lines3& pl = plines[j];
693
694
44.2k
        pl.uncor_line.a = pm.avg;
695
44.2k
        pl.uncor_line.b = normalize_safe(pm.dir, unit3());
696
697
44.2k
        pl.samec_line.a = vfloat4::zero();
698
44.2k
        pl.samec_line.b = normalize_safe(pm.avg, unit3());
699
700
44.2k
        pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
701
44.2k
        pl.uncor_pline.bs   = pl.uncor_line.b;
702
703
44.2k
        pl.samec_pline.amod = vfloat4::zero();
704
44.2k
        pl.samec_pline.bs   = pl.samec_line.b;
705
44.2k
      }
706
707
16.9k
      float uncor_error = 0.0f;
708
16.9k
      float samec_error = 0.0f;
709
710
16.9k
      compute_error_squared_rgb(pi,
711
16.9k
                                blk,
712
16.9k
                                plines,
713
16.9k
                                uncor_error,
714
16.9k
                                samec_error);
715
716
      // Compute an estimate of error introduced by weight quantization imprecision.
717
      // This error is computed as follows, for each partition
718
      //     1: compute the principal-axis vector (full length) in error-space
719
      //     2: convert the principal-axis vector to regular RGB-space
720
      //     3: scale the vector by a constant that estimates average quantization error
721
      //     4: for each texel, square the vector, then do a dot-product with the texel's
722
      //        error weight; sum up the results across all texels.
723
      //     4(optimized): square the vector once, then do a dot-product with the average
724
      //        texel error, then multiply by the number of texels.
725
726
61.2k
      for (unsigned int j = 0; j < partition_count; j++)
727
44.2k
      {
728
44.2k
        partition_lines3& pl = plines[j];
729
730
44.2k
        float tpp = static_cast<float>(pi.partition_texel_count[j]);
731
44.2k
        vfloat4 error_weights(tpp * weight_imprecision_estim);
732
733
44.2k
        vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
734
44.2k
        vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
735
736
44.2k
        uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
737
44.2k
        samec_error += dot3_s(samec_vector * samec_vector, error_weights);
738
44.2k
      }
739
740
16.9k
      insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
741
16.9k
      insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
742
16.9k
    }
743
729
  }
744
745
5.01k
  unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
746
15.0k
  for (unsigned int i = 0; i < requested_candidates; i++)
747
10.0k
  {
748
10.0k
    interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
749
10.0k
    interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
750
10.0k
  }
751
752
5.01k
  uint64_t bitmasks[1024/64] { 0 };
753
5.01k
  unsigned int emitted = 0;
754
755
  // Deduplicate the first "requested" entries
756
11.6k
  for (unsigned int i = 0; i < requested_candidates * 2;  i++)
757
11.6k
  {
758
11.6k
    unsigned int partition = interleave[i];
759
760
11.6k
    unsigned int word = partition / 64;
761
11.6k
    unsigned int bit = partition % 64;
762
763
11.6k
    bool written = bitmasks[word] & (1ull << bit);
764
765
11.6k
    if (!written)
766
10.0k
    {
767
10.0k
      best_partitions[emitted] = partition;
768
10.0k
      bitmasks[word] |= 1ull << bit;
769
10.0k
      emitted++;
770
771
10.0k
      if (emitted == requested_candidates)
772
5.01k
      {
773
5.01k
        break;
774
5.01k
      }
775
10.0k
    }
776
11.6k
  }
777
778
5.01k
  return emitted;
779
5.01k
}
780
781
#endif