/src/astc-encoder/Source/astcenc_find_best_partitioning.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2026 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
19 | | |
20 | | /** |
21 | | * @brief Functions for finding best partition for a block. |
22 | | * |
23 | | * The partition search operates in two stages. The first pass uses kmeans clustering to group |
24 | | * texels into an ideal partitioning for the requested partition count, and then compares that |
25 | | * against the 1024 partitionings generated by the ASTC partition hash function. The generated |
26 | | * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal |
27 | | * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and |
28 | | * partitionings that actually generate fewer than the requested partition count, but only the top |
29 | | * N candidates are actually put through a more detailed search. N is determined by the compressor |
30 | | * quality preset. |
31 | | * |
32 | | * For the detailed search, each candidate is checked against two possible encoding methods: |
33 | | * |
34 | | * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints). |
35 | | * - The best partitioning assuming same chroma colors (RGB + scale endpoints). |
36 | | * |
37 | | * This is implemented by computing the mean color and dominant direction for each |
38 | | * partition. This defines two lines, both of which go through the mean color value. |
39 | | * |
40 | | * - One line has a direction defined by the dominant direction; this is used to assess the error |
41 | | * from using an uncorrelated color representation. |
42 | | * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma |
43 | | * (RGB + scale) color representation. |
44 | | * |
45 | | * The best candidate is selected by computing the squared-errors that result from using these |
46 | | * lines for endpoint selection. |
47 | | */ |
48 | | |
49 | | #include <limits> |
50 | | #include "astcenc_internal.h" |
51 | | |
52 | | /** |
53 | | * @brief Pick some initial kmeans cluster centers. |
54 | | * |
55 | | * @param blk The image block color data to compress. |
56 | | * @param texel_count The number of texels in the block. |
57 | | * @param partition_count The number of partitions in the block. |
58 | | * @param[out] cluster_centers The initial partition cluster center colors. |
59 | | */ |
60 | | static void kmeans_init( |
61 | | const image_block& blk, |
62 | | unsigned int texel_count, |
63 | | unsigned int partition_count, |
64 | | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS] |
65 | 5.01k | ) { |
66 | 5.01k | promise(texel_count > 0); |
67 | 5.01k | promise(partition_count > 0); |
68 | | |
69 | 5.01k | unsigned int clusters_selected = 0; |
70 | 5.01k | float distances[BLOCK_MAX_TEXELS]; |
71 | | |
72 | | // Pick a random sample as first cluster center; 145897 from random.org |
73 | 5.01k | unsigned int sample = 145897 % texel_count; |
74 | 5.01k | vfloat4 center_color = blk.texel(sample); |
75 | 5.01k | cluster_centers[clusters_selected] = center_color; |
76 | 5.01k | clusters_selected++; |
77 | | |
78 | | // Compute the distance to the first cluster center |
79 | 5.01k | float distance_sum = 0.0f; |
80 | 132k | for (unsigned int i = 0; i < texel_count; i++) |
81 | 127k | { |
82 | 127k | vfloat4 color = blk.texel(i); |
83 | 127k | vfloat4 diff = color - center_color; |
84 | 127k | float distance = dot_s(diff * diff, blk.channel_weight); |
85 | 127k | distance_sum += distance; |
86 | 127k | distances[i] = distance; |
87 | 127k | } |
88 | | |
89 | | // More numbers from random.org for weighted-random center selection |
90 | 5.01k | const float cluster_cutoffs[9] { |
91 | 5.01k | 0.626220f, 0.932770f, 0.275454f, |
92 | 5.01k | 0.318558f, 0.240113f, 0.009190f, |
93 | 5.01k | 0.347661f, 0.731960f, 0.156391f |
94 | 5.01k | }; |
95 | | |
96 | 5.01k | unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2); |
97 | | |
98 | | // Pick the remaining samples as needed |
99 | 8.85k | while (true) |
100 | 8.85k | { |
101 | | // Pick the next center in a weighted-random fashion. |
102 | 8.85k | float summa = 0.0f; |
103 | 8.85k | float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++]; |
104 | 95.9k | for (sample = 0; sample < texel_count; sample++) |
105 | 95.9k | { |
106 | 95.9k | summa += distances[sample]; |
107 | 95.9k | if (summa >= distance_cutoff) |
108 | 8.85k | { |
109 | 8.85k | break; |
110 | 8.85k | } |
111 | 95.9k | } |
112 | | |
113 | | // Clamp to a valid range and store the selected cluster center |
114 | 8.85k | sample = astc::min(sample, texel_count - 1); |
115 | | |
116 | 8.85k | center_color = blk.texel(sample); |
117 | 8.85k | cluster_centers[clusters_selected++] = center_color; |
118 | 8.85k | if (clusters_selected >= partition_count) |
119 | 5.01k | { |
120 | 5.01k | break; |
121 | 5.01k | } |
122 | | |
123 | | // Compute the distance to the new cluster center, keep the min dist |
124 | 3.84k | distance_sum = 0.0f; |
125 | 100k | for (unsigned int i = 0; i < texel_count; i++) |
126 | 96.4k | { |
127 | 96.4k | vfloat4 color = blk.texel(i); |
128 | 96.4k | vfloat4 diff = color - center_color; |
129 | 96.4k | float distance = dot_s(diff * diff, blk.channel_weight); |
130 | 96.4k | distance = astc::min(distance, distances[i]); |
131 | 96.4k | distance_sum += distance; |
132 | 96.4k | distances[i] = distance; |
133 | 96.4k | } |
134 | 3.84k | } |
135 | 5.01k | } |
136 | | |
137 | | /** |
138 | | * @brief Assign texels to clusters, based on a set of chosen center points. |
139 | | * |
140 | | * @param blk The image block color data to compress. |
141 | | * @param texel_count The number of texels in the block. |
142 | | * @param partition_count The number of partitions in the block. |
143 | | * @param cluster_centers The partition cluster center colors. |
144 | | * @param[out] partition_of_texel The partition assigned for each texel. |
145 | | */ |
146 | | static void kmeans_assign( |
147 | | const image_block& blk, |
148 | | unsigned int texel_count, |
149 | | unsigned int partition_count, |
150 | | const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], |
151 | | uint8_t partition_of_texel[BLOCK_MAX_TEXELS] |
152 | 15.0k | ) { |
153 | 15.0k | promise(texel_count > 0); |
154 | 15.0k | promise(partition_count > 0); |
155 | | |
156 | 15.0k | uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; |
157 | | |
158 | | // Find the best partition for every texel |
159 | 398k | for (unsigned int i = 0; i < texel_count; i++) |
160 | 383k | { |
161 | 383k | float best_distance = std::numeric_limits<float>::max(); |
162 | 383k | unsigned int best_partition = 0; |
163 | | |
164 | 383k | vfloat4 color = blk.texel(i); |
165 | 1.44M | for (unsigned int j = 0; j < partition_count; j++) |
166 | 1.05M | { |
167 | 1.05M | vfloat4 diff = color - cluster_centers[j]; |
168 | 1.05M | float distance = dot_s(diff * diff, blk.channel_weight); |
169 | 1.05M | if (distance < best_distance) |
170 | 556k | { |
171 | 556k | best_distance = distance; |
172 | 556k | best_partition = j; |
173 | 556k | } |
174 | 1.05M | } |
175 | | |
176 | 383k | partition_of_texel[i] = static_cast<uint8_t>(best_partition); |
177 | 383k | partition_texel_count[best_partition]++; |
178 | 383k | } |
179 | | |
180 | | // It is possible to get a situation where a partition ends up without any texels. In this case, |
181 | | // assign texel N to partition N. This is silly, but ensures that every partition retains at |
182 | | // least one texel. Reassigning a texel in this manner may cause another partition to go empty, |
183 | | // so if we actually did a reassignment, run the whole loop over again. |
184 | 15.0k | bool problem_case; |
185 | 15.0k | do |
186 | 15.5k | { |
187 | 15.5k | problem_case = false; |
188 | 58.5k | for (unsigned int i = 0; i < partition_count; i++) |
189 | 43.0k | { |
190 | 43.0k | if (partition_texel_count[i] == 0) |
191 | 845 | { |
192 | 845 | partition_texel_count[partition_of_texel[i]]--; |
193 | 845 | partition_texel_count[i]++; |
194 | 845 | partition_of_texel[i] = static_cast<uint8_t>(i); |
195 | 845 | problem_case = true; |
196 | 845 | } |
197 | 43.0k | } |
198 | 15.5k | } while (problem_case); |
199 | 15.0k | } |
200 | | |
201 | | /** |
202 | | * @brief Compute new cluster centers based on their center of gravity. |
203 | | * |
204 | | * @param blk The image block color data to compress. |
205 | | * @param texel_count The number of texels in the block. |
206 | | * @param partition_count The number of partitions in the block. |
207 | | * @param[out] cluster_centers The new cluster center colors. |
208 | | * @param partition_of_texel The partition assigned for each texel. |
209 | | */ |
210 | | static void kmeans_update( |
211 | | const image_block& blk, |
212 | | unsigned int texel_count, |
213 | | unsigned int partition_count, |
214 | | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], |
215 | | const uint8_t partition_of_texel[BLOCK_MAX_TEXELS] |
216 | 10.0k | ) { |
217 | 10.0k | promise(texel_count > 0); |
218 | 10.0k | promise(partition_count > 0); |
219 | | |
220 | 10.0k | vfloat4 color_sum[BLOCK_MAX_PARTITIONS] { |
221 | 10.0k | vfloat4::zero(), |
222 | 10.0k | vfloat4::zero(), |
223 | 10.0k | vfloat4::zero(), |
224 | 10.0k | vfloat4::zero() |
225 | 10.0k | }; |
226 | | |
227 | 10.0k | uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; |
228 | | |
229 | | // Find the center of gravity in each cluster |
230 | 265k | for (unsigned int i = 0; i < texel_count; i++) |
231 | 255k | { |
232 | 255k | uint8_t partition = partition_of_texel[i]; |
233 | 255k | color_sum[partition] += blk.texel(i); |
234 | 255k | partition_texel_count[partition]++; |
235 | 255k | } |
236 | | |
237 | | // Set the center of gravity to be the new cluster center |
238 | 37.7k | for (unsigned int i = 0; i < partition_count; i++) |
239 | 27.7k | { |
240 | 27.7k | float scale = 1.0f / static_cast<float>(partition_texel_count[i]); |
241 | 27.7k | cluster_centers[i] = color_sum[i] * scale; |
242 | 27.7k | } |
243 | 10.0k | } |
244 | | |
245 | | /** |
246 | | * @brief Compute bit-mismatch for partitioning in 2-partition mode. |
247 | | * |
248 | | * @param a The texel assignment bitvector for the block. |
249 | | * @param b The texel assignment bitvector for the partition table. |
250 | | * |
251 | | * @return The number of bit mismatches. |
252 | | */ |
253 | | static inline uint8_t partition_mismatch2( |
254 | | const uint64_t a[2], |
255 | | const uint64_t b[2] |
256 | 1.12M | ) { |
257 | 1.12M | int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); |
258 | 1.12M | int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); |
259 | | |
260 | | // Divide by 2 because XOR always counts errors twice, once when missing |
261 | | // in the expected position, and again when present in the wrong partition |
262 | 1.12M | return static_cast<uint8_t>(astc::min(v1, v2) / 2); |
263 | 1.12M | } |
264 | | |
265 | | /** |
266 | | * @brief Compute bit-mismatch for partitioning in 3-partition mode. |
267 | | * |
268 | | * @param a The texel assignment bitvector for the block. |
269 | | * @param b The texel assignment bitvector for the partition table. |
270 | | * |
271 | | * @return The number of bit mismatches. |
272 | | */ |
273 | | static inline uint8_t partition_mismatch3( |
274 | | const uint64_t a[3], |
275 | | const uint64_t b[3] |
276 | 690k | ) { |
277 | 690k | int p00 = popcount(a[0] ^ b[0]); |
278 | 690k | int p01 = popcount(a[0] ^ b[1]); |
279 | 690k | int p02 = popcount(a[0] ^ b[2]); |
280 | | |
281 | 690k | int p10 = popcount(a[1] ^ b[0]); |
282 | 690k | int p11 = popcount(a[1] ^ b[1]); |
283 | 690k | int p12 = popcount(a[1] ^ b[2]); |
284 | | |
285 | 690k | int p20 = popcount(a[2] ^ b[0]); |
286 | 690k | int p21 = popcount(a[2] ^ b[1]); |
287 | 690k | int p22 = popcount(a[2] ^ b[2]); |
288 | | |
289 | 690k | int s0 = p11 + p22; |
290 | 690k | int s1 = p12 + p21; |
291 | 690k | int v0 = astc::min(s0, s1) + p00; |
292 | | |
293 | 690k | int s2 = p10 + p22; |
294 | 690k | int s3 = p12 + p20; |
295 | 690k | int v1 = astc::min(s2, s3) + p01; |
296 | | |
297 | 690k | int s4 = p10 + p21; |
298 | 690k | int s5 = p11 + p20; |
299 | 690k | int v2 = astc::min(s4, s5) + p02; |
300 | | |
301 | | // Divide by 2 because XOR always counts errors twice, once when missing |
302 | | // in the expected position, and again when present in the wrong partition |
303 | 690k | return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2); |
304 | 690k | } |
305 | | |
306 | | /** |
307 | | * @brief Compute bit-mismatch for partitioning in 4-partition mode. |
308 | | * |
309 | | * @param a The texel assignment bitvector for the block. |
310 | | * @param b The texel assignment bitvector for the partition table. |
311 | | * |
312 | | * @return The number of bit mismatches. |
313 | | */ |
314 | | static inline uint8_t partition_mismatch4( |
315 | | const uint64_t a[4], |
316 | | const uint64_t b[4] |
317 | 379k | ) { |
318 | 379k | int p00 = popcount(a[0] ^ b[0]); |
319 | 379k | int p01 = popcount(a[0] ^ b[1]); |
320 | 379k | int p02 = popcount(a[0] ^ b[2]); |
321 | 379k | int p03 = popcount(a[0] ^ b[3]); |
322 | | |
323 | 379k | int p10 = popcount(a[1] ^ b[0]); |
324 | 379k | int p11 = popcount(a[1] ^ b[1]); |
325 | 379k | int p12 = popcount(a[1] ^ b[2]); |
326 | 379k | int p13 = popcount(a[1] ^ b[3]); |
327 | | |
328 | 379k | int p20 = popcount(a[2] ^ b[0]); |
329 | 379k | int p21 = popcount(a[2] ^ b[1]); |
330 | 379k | int p22 = popcount(a[2] ^ b[2]); |
331 | 379k | int p23 = popcount(a[2] ^ b[3]); |
332 | | |
333 | 379k | int p30 = popcount(a[3] ^ b[0]); |
334 | 379k | int p31 = popcount(a[3] ^ b[1]); |
335 | 379k | int p32 = popcount(a[3] ^ b[2]); |
336 | 379k | int p33 = popcount(a[3] ^ b[3]); |
337 | | |
338 | 379k | int mx23 = astc::min(p22 + p33, p23 + p32); |
339 | 379k | int mx13 = astc::min(p21 + p33, p23 + p31); |
340 | 379k | int mx12 = astc::min(p21 + p32, p22 + p31); |
341 | 379k | int mx03 = astc::min(p20 + p33, p23 + p30); |
342 | 379k | int mx02 = astc::min(p20 + p32, p22 + p30); |
343 | 379k | int mx01 = astc::min(p21 + p30, p20 + p31); |
344 | | |
345 | 379k | int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12); |
346 | 379k | int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02); |
347 | 379k | int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); |
348 | 379k | int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); |
349 | | |
350 | | // Divide by 2 because XOR always counts errors twice, once when missing |
351 | | // in the expected position, and again when present in the wrong partition |
352 | 379k | return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2); |
353 | 379k | } |
354 | | |
355 | | using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); |
356 | | |
357 | | /** |
358 | | * @brief Count the partition table mismatches vs the data clustering. |
359 | | * |
360 | | * @param bsd The block size information. |
361 | | * @param partition_count The number of partitions in the block. |
362 | | * @param bitmaps The block texel partition assignment patterns. |
363 | | * @param[out] mismatch_counts The array storing per partitioning mismatch counts. |
364 | | */ |
365 | | static void count_partition_mismatch_bits( |
366 | | const block_size_descriptor& bsd, |
367 | | unsigned int partition_count, |
368 | | const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], |
369 | | uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS] |
370 | 5.01k | ) { |
371 | 5.01k | unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; |
372 | 5.01k | promise(active_count > 0); |
373 | | |
374 | 5.01k | if (partition_count == 2) |
375 | 2.18k | { |
376 | 1.12M | for (unsigned int i = 0; i < active_count; i++) |
377 | 1.12M | { |
378 | 1.12M | mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]); |
379 | 1.12M | assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); |
380 | 1.12M | assert(mismatch_counts[i] < bsd.texel_count); |
381 | 1.12M | } |
382 | 2.18k | } |
383 | 2.82k | else if (partition_count == 3) |
384 | 1.81k | { |
385 | 692k | for (unsigned int i = 0; i < active_count; i++) |
386 | 690k | { |
387 | 690k | mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]); |
388 | 690k | assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); |
389 | 690k | assert(mismatch_counts[i] < bsd.texel_count); |
390 | 690k | } |
391 | 1.81k | } |
392 | 1.01k | else |
393 | 1.01k | { |
394 | 380k | for (unsigned int i = 0; i < active_count; i++) |
395 | 379k | { |
396 | 379k | mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]); |
397 | 379k | assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS); |
398 | 379k | assert(mismatch_counts[i] < bsd.texel_count); |
399 | 379k | } |
400 | 1.01k | } |
401 | 5.01k | } |
402 | | |
403 | | /** |
404 | | * @brief Use counting sort on the mismatch array to sort partition candidates. |
405 | | * |
406 | | * @param partitioning_count The number of packed partitionings. |
407 | | * @param mismatch_count Partitioning mismatch counts, in index order. |
408 | | * @param[out] partition_ordering Partition index values, in mismatch order. |
409 | | * |
410 | | * @return The number of active partitions in this selection. |
411 | | */ |
412 | | static unsigned int get_partition_ordering_by_mismatch_bits( |
413 | | unsigned int texel_count, |
414 | | unsigned int partitioning_count, |
415 | | const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS], |
416 | | uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] |
417 | 5.01k | ) { |
418 | 5.01k | promise(partitioning_count > 0); |
419 | 5.01k | uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 }; |
420 | | |
421 | | // Create the histogram of mismatch counts |
422 | 2.20M | for (unsigned int i = 0; i < partitioning_count; i++) |
423 | 2.19M | { |
424 | 2.19M | mscount[mismatch_count[i]]++; |
425 | 2.19M | } |
426 | | |
427 | | // Create a running sum from the histogram array |
428 | | // Indices store previous values only; i.e. exclude self after sum |
429 | 5.01k | uint16_t sum = 0; |
430 | 121k | for (unsigned int i = 0; i < texel_count; i++) |
431 | 116k | { |
432 | 116k | uint16_t cnt = mscount[i]; |
433 | 116k | mscount[i] = sum; |
434 | 116k | sum += cnt; |
435 | 116k | } |
436 | | |
437 | | // Use the running sum as the index, incrementing after read to allow |
438 | | // sequential entries with the same count |
439 | 2.20M | for (unsigned int i = 0; i < partitioning_count; i++) |
440 | 2.19M | { |
441 | 2.19M | unsigned int idx = mscount[mismatch_count[i]]++; |
442 | 2.19M | partition_ordering[idx] = static_cast<uint16_t>(i); |
443 | 2.19M | } |
444 | | |
445 | 5.01k | return partitioning_count; |
446 | 5.01k | } |
447 | | |
448 | | /** |
449 | | * @brief Use k-means clustering to compute a partition ordering for a block.. |
450 | | * |
451 | | * @param bsd The block size information. |
452 | | * @param blk The image block color data to compress. |
453 | | * @param partition_count The desired number of partitions in the block. |
454 | | * @param[out] partition_ordering The list of recommended partition indices, in priority order. |
455 | | * |
456 | | * @return The number of active partitionings in this selection. |
457 | | */ |
458 | | static unsigned int compute_kmeans_partition_ordering( |
459 | | const block_size_descriptor& bsd, |
460 | | const image_block& blk, |
461 | | unsigned int partition_count, |
462 | | uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS] |
463 | 5.01k | ) { |
464 | 5.01k | vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]; |
465 | 5.01k | uint8_t texel_partitions[BLOCK_MAX_TEXELS]; |
466 | | |
467 | | // Use three passes of k-means clustering to partition the block data |
468 | 20.0k | for (unsigned int i = 0; i < 3; i++) |
469 | 15.0k | { |
470 | 15.0k | if (i == 0) |
471 | 5.01k | { |
472 | 5.01k | kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers); |
473 | 5.01k | } |
474 | 10.0k | else |
475 | 10.0k | { |
476 | 10.0k | kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); |
477 | 10.0k | } |
478 | | |
479 | 15.0k | kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); |
480 | 15.0k | } |
481 | | |
482 | | // Construct the block bitmaps of texel assignments to each partition |
483 | 5.01k | uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 }; |
484 | 5.01k | unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS); |
485 | 5.01k | promise(texels_to_process > 0); |
486 | 121k | for (unsigned int i = 0; i < texels_to_process; i++) |
487 | 116k | { |
488 | 116k | unsigned int idx = bsd.kmeans_texels[i]; |
489 | 116k | bitmaps[texel_partitions[idx]] |= 1ULL << i; |
490 | 116k | } |
491 | | |
492 | | // Count the mismatch between the block and the format's partition tables |
493 | 5.01k | uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]; |
494 | 5.01k | count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); |
495 | | |
496 | | // Sort the partitions based on the number of mismatched bits |
497 | 5.01k | return get_partition_ordering_by_mismatch_bits( |
498 | 5.01k | texels_to_process, |
499 | 5.01k | bsd.partitioning_count_selected[partition_count - 1], |
500 | 5.01k | mismatch_counts, partition_ordering); |
501 | 5.01k | } |
502 | | |
503 | | /** |
504 | | * @brief Insert a partitioning into an order list of results, sorted by error. |
505 | | * |
506 | | * @param max_values The max number of entries in the best result arrays. |
507 | | * @param this_error The error of the new entry. |
508 | | * @param this_partition The partition ID of the new entry. |
509 | | * @param[out] best_errors The array of best error values. |
510 | | * @param[out] best_partitions The array of best partition values. |
511 | | */ |
512 | | static void insert_result( |
513 | | unsigned int max_values, |
514 | | float this_error, |
515 | | unsigned int this_partition, |
516 | | float* best_errors, |
517 | | unsigned int* best_partitions) |
518 | 231k | { |
519 | 231k | promise(max_values > 0); |
520 | | |
521 | | // Don't bother searching if the current worst error beats the new error |
522 | 231k | if (this_error >= best_errors[max_values - 1]) |
523 | 179k | { |
524 | 179k | return; |
525 | 179k | } |
526 | | |
527 | | // Else insert into the list in error-order |
528 | 75.2k | for (unsigned int i = 0; i < max_values; i++) |
529 | 75.2k | { |
530 | | // Existing result is better - move on ... |
531 | 75.2k | if (this_error > best_errors[i]) |
532 | 22.7k | { |
533 | 22.7k | continue; |
534 | 22.7k | } |
535 | | |
536 | | // Move existing results down one |
537 | 82.2k | for (unsigned int j = max_values - 1; j > i; j--) |
538 | 29.7k | { |
539 | 29.7k | best_errors[j] = best_errors[j - 1]; |
540 | 29.7k | best_partitions[j] = best_partitions[j - 1]; |
541 | 29.7k | } |
542 | | |
543 | | // Insert new result |
544 | 52.5k | best_errors[i] = this_error; |
545 | 52.5k | best_partitions[i] = this_partition; |
546 | 52.5k | break; |
547 | 75.2k | } |
548 | 52.5k | } |
549 | | |
550 | | /* See header for documentation. */ |
551 | | unsigned int find_best_partition_candidates( |
552 | | const block_size_descriptor& bsd, |
553 | | const image_block& blk, |
554 | | unsigned int partition_count, |
555 | | unsigned int partition_search_limit, |
556 | | unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES], |
557 | | unsigned int requested_candidates |
558 | 5.01k | ) { |
559 | | // Constant used to estimate quantization error for a given partitioning; the optimal value for |
560 | | // this depends on bitrate. These values have been determined empirically. |
561 | 5.01k | unsigned int texels_per_block = bsd.texel_count; |
562 | 5.01k | float weight_imprecision_estim = 0.055f; |
563 | 5.01k | if (texels_per_block <= 20) |
564 | 3.72k | { |
565 | 3.72k | weight_imprecision_estim = 0.03f; |
566 | 3.72k | } |
567 | 1.29k | else if (texels_per_block <= 31) |
568 | 594 | { |
569 | 594 | weight_imprecision_estim = 0.04f; |
570 | 594 | } |
571 | 698 | else if (texels_per_block <= 41) |
572 | 171 | { |
573 | 171 | weight_imprecision_estim = 0.05f; |
574 | 171 | } |
575 | | |
576 | 5.01k | promise(partition_count > 0); |
577 | 5.01k | promise(partition_search_limit > 0); |
578 | | |
579 | 5.01k | weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; |
580 | | |
581 | 5.01k | uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS]; |
582 | 5.01k | unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); |
583 | 5.01k | partition_search_limit = astc::min(partition_search_limit, sequence_len); |
584 | 5.01k | requested_candidates = astc::min(partition_search_limit, requested_candidates); |
585 | | |
586 | 5.01k | bool uses_alpha = !blk.is_constant_channel(3); |
587 | | |
588 | | // Partitioning errors assuming uncorrelated-chrominance endpoints |
589 | 5.01k | float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; |
590 | 5.01k | unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES] = {}; |
591 | | |
592 | | // Partitioning errors assuming same-chrominance endpoints |
593 | 5.01k | float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; |
594 | 5.01k | unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES] = {}; |
595 | | |
596 | 15.0k | for (unsigned int i = 0; i < requested_candidates; i++) |
597 | 10.0k | { |
598 | 10.0k | uncor_best_errors[i] = ERROR_CALC_DEFAULT; |
599 | 10.0k | samec_best_errors[i] = ERROR_CALC_DEFAULT; |
600 | 10.0k | } |
601 | | |
602 | 5.01k | if (uses_alpha) |
603 | 4.28k | { |
604 | 103k | for (unsigned int i = 0; i < partition_search_limit; i++) |
605 | 99.0k | { |
606 | 99.0k | unsigned int partition = partition_sequence[i]; |
607 | 99.0k | const auto& pi = bsd.get_raw_partition_info(partition_count, partition); |
608 | | |
609 | | // Compute weighting to give to each component in each partition |
610 | 99.0k | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
611 | | |
612 | 99.0k | compute_avgs_and_dirs_4_comp(pi, blk, pms); |
613 | | |
614 | 99.0k | line4 uncor_lines[BLOCK_MAX_PARTITIONS]; |
615 | 99.0k | line4 samec_lines[BLOCK_MAX_PARTITIONS]; |
616 | | |
617 | 99.0k | processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS]; |
618 | 99.0k | processed_line4 samec_plines[BLOCK_MAX_PARTITIONS]; |
619 | | |
620 | 99.0k | float line_lengths[BLOCK_MAX_PARTITIONS]; |
621 | | |
622 | 357k | for (unsigned int j = 0; j < partition_count; j++) |
623 | 258k | { |
624 | 258k | partition_metrics& pm = pms[j]; |
625 | | |
626 | 258k | uncor_lines[j].a = pm.avg; |
627 | 258k | uncor_lines[j].b = normalize_safe(pm.dir, unit4()); |
628 | | |
629 | 258k | uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); |
630 | 258k | uncor_plines[j].bs = uncor_lines[j].b; |
631 | | |
632 | 258k | samec_lines[j].a = vfloat4::zero(); |
633 | 258k | samec_lines[j].b = normalize_safe(pm.avg, unit4()); |
634 | | |
635 | 258k | samec_plines[j].amod = vfloat4::zero(); |
636 | 258k | samec_plines[j].bs = samec_lines[j].b; |
637 | 258k | } |
638 | | |
639 | 99.0k | float uncor_error = 0.0f; |
640 | 99.0k | float samec_error = 0.0f; |
641 | | |
642 | 99.0k | compute_error_squared_rgba(pi, |
643 | 99.0k | blk, |
644 | 99.0k | uncor_plines, |
645 | 99.0k | samec_plines, |
646 | 99.0k | line_lengths, |
647 | 99.0k | uncor_error, |
648 | 99.0k | samec_error); |
649 | | |
650 | | // Compute an estimate of error introduced by weight quantization imprecision. |
651 | | // This error is computed as follows, for each partition |
652 | | // 1: compute the principal-axis vector (full length) in error-space |
653 | | // 2: convert the principal-axis vector to regular RGB-space |
654 | | // 3: scale the vector by a constant that estimates average quantization error |
655 | | // 4: for each texel, square the vector, then do a dot-product with the texel's |
656 | | // error weight; sum up the results across all texels. |
657 | | // 4(optimized): square the vector once, then do a dot-product with the average |
658 | | // texel error, then multiply by the number of texels. |
659 | | |
660 | 357k | for (unsigned int j = 0; j < partition_count; j++) |
661 | 258k | { |
662 | 258k | float tpp = static_cast<float>(pi.partition_texel_count[j]); |
663 | 258k | vfloat4 error_weights(tpp * weight_imprecision_estim); |
664 | | |
665 | 258k | vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j]; |
666 | 258k | vfloat4 samec_vector = samec_lines[j].b * line_lengths[j]; |
667 | | |
668 | 258k | uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); |
669 | 258k | samec_error += dot_s(samec_vector * samec_vector, error_weights); |
670 | 258k | } |
671 | | |
672 | 99.0k | insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); |
673 | 99.0k | insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); |
674 | 99.0k | } |
675 | 4.28k | } |
676 | 729 | else |
677 | 729 | { |
678 | 17.6k | for (unsigned int i = 0; i < partition_search_limit; i++) |
679 | 16.9k | { |
680 | 16.9k | unsigned int partition = partition_sequence[i]; |
681 | 16.9k | const auto& pi = bsd.get_raw_partition_info(partition_count, partition); |
682 | | |
683 | | // Compute weighting to give to each component in each partition |
684 | 16.9k | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
685 | 16.9k | compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); |
686 | | |
687 | 16.9k | partition_lines3 plines[BLOCK_MAX_PARTITIONS]; |
688 | | |
689 | 61.2k | for (unsigned int j = 0; j < partition_count; j++) |
690 | 44.2k | { |
691 | 44.2k | partition_metrics& pm = pms[j]; |
692 | 44.2k | partition_lines3& pl = plines[j]; |
693 | | |
694 | 44.2k | pl.uncor_line.a = pm.avg; |
695 | 44.2k | pl.uncor_line.b = normalize_safe(pm.dir, unit3()); |
696 | | |
697 | 44.2k | pl.samec_line.a = vfloat4::zero(); |
698 | 44.2k | pl.samec_line.b = normalize_safe(pm.avg, unit3()); |
699 | | |
700 | 44.2k | pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); |
701 | 44.2k | pl.uncor_pline.bs = pl.uncor_line.b; |
702 | | |
703 | 44.2k | pl.samec_pline.amod = vfloat4::zero(); |
704 | 44.2k | pl.samec_pline.bs = pl.samec_line.b; |
705 | 44.2k | } |
706 | | |
707 | 16.9k | float uncor_error = 0.0f; |
708 | 16.9k | float samec_error = 0.0f; |
709 | | |
710 | 16.9k | compute_error_squared_rgb(pi, |
711 | 16.9k | blk, |
712 | 16.9k | plines, |
713 | 16.9k | uncor_error, |
714 | 16.9k | samec_error); |
715 | | |
716 | | // Compute an estimate of error introduced by weight quantization imprecision. |
717 | | // This error is computed as follows, for each partition |
718 | | // 1: compute the principal-axis vector (full length) in error-space |
719 | | // 2: convert the principal-axis vector to regular RGB-space |
720 | | // 3: scale the vector by a constant that estimates average quantization error |
721 | | // 4: for each texel, square the vector, then do a dot-product with the texel's |
722 | | // error weight; sum up the results across all texels. |
723 | | // 4(optimized): square the vector once, then do a dot-product with the average |
724 | | // texel error, then multiply by the number of texels. |
725 | | |
726 | 61.2k | for (unsigned int j = 0; j < partition_count; j++) |
727 | 44.2k | { |
728 | 44.2k | partition_lines3& pl = plines[j]; |
729 | | |
730 | 44.2k | float tpp = static_cast<float>(pi.partition_texel_count[j]); |
731 | 44.2k | vfloat4 error_weights(tpp * weight_imprecision_estim); |
732 | | |
733 | 44.2k | vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length; |
734 | 44.2k | vfloat4 samec_vector = pl.samec_line.b * pl.line_length; |
735 | | |
736 | 44.2k | uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); |
737 | 44.2k | samec_error += dot3_s(samec_vector * samec_vector, error_weights); |
738 | 44.2k | } |
739 | | |
740 | 16.9k | insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); |
741 | 16.9k | insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); |
742 | 16.9k | } |
743 | 729 | } |
744 | | |
745 | 5.01k | unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES]; |
746 | 15.0k | for (unsigned int i = 0; i < requested_candidates; i++) |
747 | 10.0k | { |
748 | 10.0k | interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; |
749 | 10.0k | interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; |
750 | 10.0k | } |
751 | | |
752 | 5.01k | uint64_t bitmasks[1024/64] { 0 }; |
753 | 5.01k | unsigned int emitted = 0; |
754 | | |
755 | | // Deduplicate the first "requested" entries |
756 | 11.6k | for (unsigned int i = 0; i < requested_candidates * 2; i++) |
757 | 11.6k | { |
758 | 11.6k | unsigned int partition = interleave[i]; |
759 | | |
760 | 11.6k | unsigned int word = partition / 64; |
761 | 11.6k | unsigned int bit = partition % 64; |
762 | | |
763 | 11.6k | bool written = bitmasks[word] & (1ull << bit); |
764 | | |
765 | 11.6k | if (!written) |
766 | 10.0k | { |
767 | 10.0k | best_partitions[emitted] = partition; |
768 | 10.0k | bitmasks[word] |= 1ull << bit; |
769 | 10.0k | emitted++; |
770 | | |
771 | 10.0k | if (emitted == requested_candidates) |
772 | 5.01k | { |
773 | 5.01k | break; |
774 | 5.01k | } |
775 | 10.0k | } |
776 | 11.6k | } |
777 | | |
778 | 5.01k | return emitted; |
779 | 5.01k | } |
780 | | |
781 | | #endif |