/src/astc-encoder/Source/astcenc_compress_symbolic.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2026 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
19 | | |
20 | | /** |
21 | | * @brief Functions to compress a symbolic block. |
22 | | */ |
23 | | |
24 | | #include "astcenc_internal.h" |
25 | | #include "astcenc_diagnostic_trace.h" |
26 | | |
27 | | #include <cassert> |
28 | | |
29 | | /** |
30 | | * @brief Merge two planes of endpoints into a single vector. |
31 | | * |
32 | | * @param ep_plane1 The endpoints for plane 1. |
33 | | * @param ep_plane2 The endpoints for plane 2. |
34 | | * @param component_plane2 The color component for plane 2. |
35 | | * @param[out] result The merged output. |
36 | | */ |
37 | | static void merge_endpoints( |
38 | | const endpoints& ep_plane1, |
39 | | const endpoints& ep_plane2, |
40 | | unsigned int component_plane2, |
41 | | endpoints& result |
42 | 6.99k | ) { |
43 | 6.99k | unsigned int partition_count = ep_plane1.partition_count; |
44 | 6.99k | assert(partition_count == 1); |
45 | | |
46 | 6.99k | vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2); |
47 | | |
48 | 6.99k | result.partition_count = partition_count; |
49 | 6.99k | result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask); |
50 | 6.99k | result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask); |
51 | 6.99k | } |
52 | | |
53 | | /** |
54 | | * @brief Attempt to improve weights given a chosen configuration. |
55 | | * |
56 | | * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per |
57 | | * partition and per plane) and attempt to improve image quality by moving each weight up by one or |
58 | | * down by one quantization step. |
59 | | * |
60 | | * This is a specialized function which only supports operating on undecimated weight grids, |
61 | | * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation |
62 | | * is needed less often. |
63 | | * |
64 | | * @param decode_mode The decode mode (LDR, HDR). |
65 | | * @param bsd The block size information. |
66 | | * @param blk The image block color data to compress. |
67 | | * @param[out] scb The symbolic compressed block output. |
68 | | */ |
69 | | static bool realign_weights_undecimated( |
70 | | astcenc_profile decode_mode, |
71 | | const block_size_descriptor& bsd, |
72 | | const image_block& blk, |
73 | | symbolic_compressed_block& scb |
74 | 33.9k | ) { |
75 | | // Get the partition descriptor |
76 | 33.9k | unsigned int partition_count = scb.partition_count; |
77 | 33.9k | const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); |
78 | | |
79 | | // Get the quantization table |
80 | 33.9k | const block_mode& bm = bsd.get_block_mode(scb.block_mode); |
81 | 33.9k | unsigned int weight_quant_level = bm.quant_mode; |
82 | 33.9k | const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; |
83 | | |
84 | 33.9k | unsigned int max_plane = bm.is_dual_plane; |
85 | 33.9k | int plane2_component = scb.plane2_component; |
86 | 33.9k | vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); |
87 | | |
88 | | // Decode the color endpoints |
89 | 33.9k | bool rgb_hdr; |
90 | 33.9k | bool alpha_hdr; |
91 | 33.9k | vint4 endpnt0[BLOCK_MAX_PARTITIONS]; |
92 | 33.9k | vint4 endpnt1[BLOCK_MAX_PARTITIONS]; |
93 | 33.9k | vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; |
94 | 33.9k | vfloat4 offset[BLOCK_MAX_PARTITIONS]; |
95 | | |
96 | 33.9k | promise(partition_count > 0); |
97 | | |
98 | 92.8k | for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) |
99 | 58.8k | { |
100 | 58.8k | unpack_color_endpoints(decode_mode, |
101 | 58.8k | scb.color_formats[pa_idx], |
102 | 58.8k | scb.color_values[pa_idx], |
103 | 58.8k | rgb_hdr, alpha_hdr, |
104 | 58.8k | endpnt0[pa_idx], |
105 | 58.8k | endpnt1[pa_idx]); |
106 | 58.8k | } |
107 | | |
108 | 33.9k | uint8_t* dec_weights_uquant = scb.weights; |
109 | 33.9k | bool adjustments = false; |
110 | | |
111 | | // For each plane and partition ... |
112 | 76.6k | for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) |
113 | 42.6k | { |
114 | 110k | for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) |
115 | 67.5k | { |
116 | | // Compute the endpoint delta for all components in current plane |
117 | 67.5k | vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; |
118 | 67.5k | epd = select(epd, vint4::zero(), plane_mask); |
119 | | |
120 | 67.5k | endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); |
121 | 67.5k | offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); |
122 | 67.5k | } |
123 | | |
124 | | // For each weight compute previous, current, and next errors |
125 | 42.6k | promise(bsd.texel_count > 0); |
126 | 791k | for (unsigned int texel = 0; texel < bsd.texel_count; texel++) |
127 | 749k | { |
128 | 749k | int uqw = dec_weights_uquant[texel]; |
129 | | |
130 | 749k | uint32_t prev_and_next = qat.prev_next_values[uqw]; |
131 | 749k | int uqw_down = prev_and_next & 0xFF; |
132 | 749k | int uqw_up = (prev_and_next >> 8) & 0xFF; |
133 | | |
134 | | // Interpolate the colors to create the diffs |
135 | 749k | float weight_base = static_cast<float>(uqw); |
136 | 749k | float weight_down = static_cast<float>(uqw_down - uqw); |
137 | 749k | float weight_up = static_cast<float>(uqw_up - uqw); |
138 | | |
139 | 749k | unsigned int partition = pi.partition_of_texel[texel]; |
140 | 749k | vfloat4 color_offset = offset[partition]; |
141 | 749k | vfloat4 color_base = endpnt0f[partition]; |
142 | | |
143 | 749k | vfloat4 color = color_base + color_offset * weight_base; |
144 | 749k | vfloat4 orig_color = blk.texel(texel); |
145 | 749k | vfloat4 error_weight = blk.channel_weight; |
146 | | |
147 | 749k | vfloat4 color_diff = color - orig_color; |
148 | 749k | vfloat4 color_diff_down = color_diff + color_offset * weight_down; |
149 | 749k | vfloat4 color_diff_up = color_diff + color_offset * weight_up; |
150 | | |
151 | 749k | float error_base = dot_s(color_diff * color_diff, error_weight); |
152 | 749k | float error_down = dot_s(color_diff_down * color_diff_down, error_weight); |
153 | 749k | float error_up = dot_s(color_diff_up * color_diff_up, error_weight); |
154 | | |
155 | | // Check if the prev or next error is better, and if so use it |
156 | 749k | if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) |
157 | 54.7k | { |
158 | 54.7k | dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); |
159 | 54.7k | adjustments = true; |
160 | 54.7k | } |
161 | 694k | else if ((error_down < error_base) && (uqw > 0)) |
162 | 54.1k | { |
163 | 54.1k | dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); |
164 | 54.1k | adjustments = true; |
165 | 54.1k | } |
166 | 749k | } |
167 | | |
168 | | // Prepare iteration for plane 2 |
169 | 42.6k | dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; |
170 | 42.6k | plane_mask = ~plane_mask; |
171 | 42.6k | } |
172 | | |
173 | 33.9k | return adjustments; |
174 | 33.9k | } |
175 | | |
176 | | /** |
177 | | * @brief Attempt to improve weights given a chosen configuration. |
178 | | * |
179 | | * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per |
180 | | * partition and per plane) and attempt to improve image quality by moving each weight up by one or |
181 | | * down by one quantization step. |
182 | | * |
183 | | * @param decode_mode The decode mode (LDR, HDR). |
184 | | * @param bsd The block size information. |
185 | | * @param blk The image block color data to compress. |
186 | | * @param[out] scb The symbolic compressed block output. |
187 | | */ |
188 | | static bool realign_weights_decimated( |
189 | | astcenc_profile decode_mode, |
190 | | const block_size_descriptor& bsd, |
191 | | const image_block& blk, |
192 | | symbolic_compressed_block& scb |
193 | 12.2k | ) { |
194 | | // Get the partition descriptor |
195 | 12.2k | unsigned int partition_count = scb.partition_count; |
196 | 12.2k | const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); |
197 | | |
198 | | // Get the quantization table |
199 | 12.2k | const block_mode& bm = bsd.get_block_mode(scb.block_mode); |
200 | 12.2k | unsigned int weight_quant_level = bm.quant_mode; |
201 | 12.2k | const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; |
202 | | |
203 | | // Get the decimation table |
204 | 12.2k | const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); |
205 | 12.2k | unsigned int weight_count = di.weight_count; |
206 | 12.2k | assert(weight_count != bsd.texel_count); |
207 | | |
208 | 12.2k | unsigned int max_plane = bm.is_dual_plane; |
209 | 12.2k | int plane2_component = scb.plane2_component; |
210 | 12.2k | vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); |
211 | | |
212 | | // Decode the color endpoints |
213 | 12.2k | bool rgb_hdr; |
214 | 12.2k | bool alpha_hdr; |
215 | 12.2k | vint4 endpnt0[BLOCK_MAX_PARTITIONS]; |
216 | 12.2k | vint4 endpnt1[BLOCK_MAX_PARTITIONS]; |
217 | 12.2k | vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; |
218 | 12.2k | vfloat4 offset[BLOCK_MAX_PARTITIONS]; |
219 | | |
220 | 12.2k | promise(partition_count > 0); |
221 | 12.2k | promise(weight_count > 0); |
222 | | |
223 | 28.5k | for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) |
224 | 16.3k | { |
225 | 16.3k | unpack_color_endpoints(decode_mode, |
226 | 16.3k | scb.color_formats[pa_idx], |
227 | 16.3k | scb.color_values[pa_idx], |
228 | 16.3k | rgb_hdr, alpha_hdr, |
229 | 16.3k | endpnt0[pa_idx], |
230 | 16.3k | endpnt1[pa_idx]); |
231 | 16.3k | } |
232 | | |
233 | 12.2k | uint8_t* dec_weights_uquant = scb.weights; |
234 | 12.2k | bool adjustments = false; |
235 | | |
236 | | // For each plane and partition ... |
237 | 32.0k | for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) |
238 | 19.8k | { |
239 | 43.8k | for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) |
240 | 23.9k | { |
241 | | // Compute the endpoint delta for all components in current plane |
242 | 23.9k | vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; |
243 | 23.9k | epd = select(epd, vint4::zero(), plane_mask); |
244 | | |
245 | 23.9k | endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); |
246 | 23.9k | offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); |
247 | 23.9k | } |
248 | | |
249 | | // Create an unquantized weight grid for this decimation level |
250 | 19.8k | ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS]; |
251 | 118k | for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) |
252 | 99.0k | { |
253 | 99.0k | vint unquant_value(dec_weights_uquant + we_idx); |
254 | 99.0k | vfloat unquant_valuef = int_to_float(unquant_value); |
255 | 99.0k | storea(unquant_valuef, uq_weightsf + we_idx); |
256 | 99.0k | } |
257 | | |
258 | | // For each weight compute previous, current, and next errors |
259 | 407k | for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++) |
260 | 387k | { |
261 | 387k | int uqw = dec_weights_uquant[we_idx]; |
262 | 387k | uint32_t prev_and_next = qat.prev_next_values[uqw]; |
263 | | |
264 | 387k | float uqw_base = uq_weightsf[we_idx]; |
265 | 387k | float uqw_down = static_cast<float>(prev_and_next & 0xFF); |
266 | 387k | float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF); |
267 | | |
268 | 387k | float uqw_diff_down = uqw_down - uqw_base; |
269 | 387k | float uqw_diff_up = uqw_up - uqw_base; |
270 | | |
271 | 387k | vfloat4 error_basev = vfloat4::zero(); |
272 | 387k | vfloat4 error_downv = vfloat4::zero(); |
273 | 387k | vfloat4 error_upv = vfloat4::zero(); |
274 | | |
275 | | // Interpolate the colors to create the diffs |
276 | 387k | unsigned int texels_to_evaluate = di.weight_texel_count[we_idx]; |
277 | 387k | promise(texels_to_evaluate > 0); |
278 | 2.21M | for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++) |
279 | 1.82M | { |
280 | 1.82M | unsigned int texel = di.weight_texels_tr[te_idx][we_idx]; |
281 | | |
282 | 1.82M | float tw_base = di.texel_contrib_for_weight[te_idx][we_idx]; |
283 | | |
284 | 1.82M | float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel] |
285 | 1.82M | + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel]) |
286 | 1.82M | + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel] |
287 | 1.82M | + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]); |
288 | | |
289 | | // Ideally this is integer rounded, but IQ gain it isn't worth the overhead |
290 | | // float weight = astc::flt_rd(weight_base + 0.5f); |
291 | | // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight; |
292 | | // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight; |
293 | 1.82M | float weight_down = weight_base + uqw_diff_down * tw_base - weight_base; |
294 | 1.82M | float weight_up = weight_base + uqw_diff_up * tw_base - weight_base; |
295 | | |
296 | 1.82M | unsigned int partition = pi.partition_of_texel[texel]; |
297 | 1.82M | vfloat4 color_offset = offset[partition]; |
298 | 1.82M | vfloat4 color_base = endpnt0f[partition]; |
299 | | |
300 | 1.82M | vfloat4 color = color_base + color_offset * weight_base; |
301 | 1.82M | vfloat4 orig_color = blk.texel(texel); |
302 | | |
303 | 1.82M | vfloat4 color_diff = color - orig_color; |
304 | 1.82M | vfloat4 color_down_diff = color_diff + color_offset * weight_down; |
305 | 1.82M | vfloat4 color_up_diff = color_diff + color_offset * weight_up; |
306 | | |
307 | 1.82M | error_basev += color_diff * color_diff; |
308 | 1.82M | error_downv += color_down_diff * color_down_diff; |
309 | 1.82M | error_upv += color_up_diff * color_up_diff; |
310 | 1.82M | } |
311 | | |
312 | 387k | vfloat4 error_weight = blk.channel_weight; |
313 | 387k | float error_base = hadd_s(error_basev * error_weight); |
314 | 387k | float error_down = hadd_s(error_downv * error_weight); |
315 | 387k | float error_up = hadd_s(error_upv * error_weight); |
316 | | |
317 | | // Check if the prev or next error is better, and if so use it |
318 | 387k | if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) |
319 | 17.6k | { |
320 | 17.6k | uq_weightsf[we_idx] = uqw_up; |
321 | 17.6k | dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up); |
322 | 17.6k | adjustments = true; |
323 | 17.6k | } |
324 | 369k | else if ((error_down < error_base) && (uqw > 0)) |
325 | 33.0k | { |
326 | 33.0k | uq_weightsf[we_idx] = uqw_down; |
327 | 33.0k | dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down); |
328 | 33.0k | adjustments = true; |
329 | 33.0k | } |
330 | 387k | } |
331 | | |
332 | | // Prepare iteration for plane 2 |
333 | 19.8k | dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; |
334 | 19.8k | plane_mask = ~plane_mask; |
335 | 19.8k | } |
336 | | |
337 | 12.2k | return adjustments; |
338 | 12.2k | } |
339 | | |
340 | | /** |
341 | | * @brief Compress a block using a chosen partitioning and 1 plane of weights. |
342 | | * |
343 | | * @param config The compressor configuration. |
344 | | * @param bsd The block size information. |
345 | | * @param blk The image block color data to compress. |
346 | | * @param only_always True if we only use "always" percentile block modes. |
347 | | * @param tune_errorval_threshold The error value threshold. |
348 | | * @param partition_count The partition count. |
349 | | * @param partition_index The partition index if @c partition_count is 2-4. |
350 | | * @param[out] scb The symbolic compressed block output. |
351 | | * @param[out] tmpbuf The quantized weights for plane 1. |
352 | | */ |
353 | | static float compress_symbolic_block_for_partition_1plane( |
354 | | const astcenc_config& config, |
355 | | const block_size_descriptor& bsd, |
356 | | const image_block& blk, |
357 | | bool only_always, |
358 | | float tune_errorval_threshold, |
359 | | unsigned int partition_count, |
360 | | unsigned int partition_index, |
361 | | symbolic_compressed_block& scb, |
362 | | compression_working_buffers& tmpbuf, |
363 | | int quant_limit |
364 | 10.9k | ) { |
365 | 10.9k | promise(partition_count > 0); |
366 | 10.9k | promise(config.tune_candidate_limit > 0); |
367 | 10.9k | promise(config.tune_refinement_limit > 0); |
368 | | |
369 | 10.9k | int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); |
370 | | |
371 | 10.9k | auto compute_difference = &compute_symbolic_block_difference_1plane; |
372 | 10.9k | if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM)) |
373 | 1.76k | { |
374 | 1.76k | compute_difference = &compute_symbolic_block_difference_1plane_1partition; |
375 | 1.76k | } |
376 | | |
377 | 10.9k | const auto& pi = bsd.get_partition_info(partition_count, partition_index); |
378 | | |
379 | | // Compute ideal weights and endpoint colors, with no quantization or decimation |
380 | 10.9k | endpoints_and_weights& ei = tmpbuf.ei1; |
381 | 10.9k | compute_ideal_colors_and_weights_1plane(blk, pi, ei); |
382 | | |
383 | | // Compute ideal weights and endpoint colors for every decimation |
384 | 10.9k | float* dec_weights_ideal = tmpbuf.dec_weights_ideal; |
385 | 10.9k | uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; |
386 | | |
387 | | // For each decimation mode, compute an ideal set of weights with no quantization |
388 | 10.9k | unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always |
389 | 10.9k | : bsd.decimation_mode_count_selected; |
390 | 10.9k | promise(max_decimation_modes > 0); |
391 | 116k | for (unsigned int i = 0; i < max_decimation_modes; i++) |
392 | 105k | { |
393 | 105k | const auto& dm = bsd.get_decimation_mode(i); |
394 | 105k | if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant))) |
395 | 34.5k | { |
396 | 34.5k | continue; |
397 | 34.5k | } |
398 | | |
399 | 70.7k | const auto& di = bsd.get_decimation_info(i); |
400 | | |
401 | 70.7k | compute_ideal_weights_for_decimation( |
402 | 70.7k | ei, |
403 | 70.7k | di, |
404 | 70.7k | dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); |
405 | 70.7k | } |
406 | | |
407 | | // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal |
408 | | // weight pair, compute the smallest weight that will result in a color value greater than 1 |
409 | 10.9k | vfloat4 min_ep(10.0f); |
410 | 35.4k | for (unsigned int i = 0; i < partition_count; i++) |
411 | 24.5k | { |
412 | 24.5k | vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]); |
413 | | |
414 | 24.5k | vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep); |
415 | 24.5k | min_ep = select(min_ep, ep, use_ep); |
416 | 24.5k | } |
417 | | |
418 | 10.9k | float min_wt_cutoff = hmin_s(min_ep); |
419 | | |
420 | | // For each mode, use the angular method to compute a shift |
421 | 10.9k | compute_angular_endpoints_1plane( |
422 | 10.9k | only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); |
423 | | |
424 | 10.9k | float* weight_low_value = tmpbuf.weight_low_value1; |
425 | 10.9k | float* weight_high_value = tmpbuf.weight_high_value1; |
426 | 10.9k | int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; |
427 | 10.9k | float* qwt_errors = tmpbuf.qwt_errors; |
428 | | |
429 | | // For each mode (which specifies a decimation and a quantization): |
430 | | // * Compute number of bits needed for the quantized weights |
431 | | // * Generate an optimized set of quantized weights |
432 | | // * Compute quantization errors for the mode |
433 | | |
434 | 10.9k | static const int8_t free_bits_for_partition_count[4] { |
435 | 10.9k | 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS |
436 | 10.9k | }; |
437 | | |
438 | 10.9k | unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always |
439 | 10.9k | : bsd.block_mode_count_1plane_selected; |
440 | 10.9k | promise(max_block_modes > 0); |
441 | 280k | for (unsigned int i = 0; i < max_block_modes; i++) |
442 | 269k | { |
443 | 269k | const block_mode& bm = bsd.block_modes[i]; |
444 | | |
445 | 269k | if (bm.quant_mode > max_weight_quant) |
446 | 91.2k | { |
447 | 91.2k | qwt_errors[i] = 1e38f; |
448 | 91.2k | continue; |
449 | 91.2k | } |
450 | | |
451 | 269k | assert(!bm.is_dual_plane); |
452 | 178k | int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits; |
453 | 178k | if (bitcount <= 0) |
454 | 1.29k | { |
455 | 1.29k | qwt_errors[i] = 1e38f; |
456 | 1.29k | continue; |
457 | 1.29k | } |
458 | | |
459 | 176k | if (weight_high_value[i] > 1.02f * min_wt_cutoff) |
460 | 35.0k | { |
461 | 35.0k | weight_high_value[i] = 1.0f; |
462 | 35.0k | } |
463 | | |
464 | 176k | int decimation_mode = bm.decimation_mode; |
465 | 176k | const auto& di = bsd.get_decimation_info(decimation_mode); |
466 | | |
467 | 176k | qwt_bitcounts[i] = static_cast<int8_t>(bitcount); |
468 | | |
469 | 176k | ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; |
470 | | |
471 | | // Generate the optimized set of weights for the weight mode |
472 | 176k | compute_quantized_weights_for_decimation( |
473 | 176k | di, |
474 | 176k | weight_low_value[i], weight_high_value[i], |
475 | 176k | dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, |
476 | 176k | dec_weights_uquantf, |
477 | 176k | dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, |
478 | 176k | bm.get_weight_quant_mode()); |
479 | | |
480 | | // Compute weight quantization errors for the block mode |
481 | 176k | qwt_errors[i] = compute_error_of_weight_set_1plane( |
482 | 176k | ei, |
483 | 176k | di, |
484 | 176k | dec_weights_uquantf); |
485 | 176k | } |
486 | | |
487 | | // Decide the optimal combination of color endpoint encodings and weight encodings |
488 | 10.9k | uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; |
489 | 10.9k | int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; |
490 | | |
491 | 10.9k | quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; |
492 | 10.9k | quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; |
493 | | |
494 | 10.9k | unsigned int candidate_count = compute_ideal_endpoint_formats( |
495 | 10.9k | pi, blk, ei.ep, qwt_bitcounts, qwt_errors, |
496 | 10.9k | config.tune_candidate_limit, 0, max_block_modes, |
497 | 10.9k | partition_format_specifiers, block_mode_index, |
498 | 10.9k | color_quant_level, color_quant_level_mod, tmpbuf); |
499 | | |
500 | | // Iterate over the N believed-to-be-best modes to find out which one is actually best |
501 | 10.9k | float best_errorval_in_mode = ERROR_CALC_DEFAULT; |
502 | 10.9k | float best_errorval_in_scb = scb.errorval; |
503 | | |
504 | 41.7k | for (unsigned int i = 0; i < candidate_count; i++) |
505 | 30.8k | { |
506 | 30.8k | TRACE_NODE(node0, "candidate"); |
507 | | |
508 | 30.8k | const int bm_packed_index = block_mode_index[i]; |
509 | 30.8k | assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected)); |
510 | 30.8k | const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; |
511 | | |
512 | 30.8k | int decimation_mode = qw_bm.decimation_mode; |
513 | 30.8k | const auto& di = bsd.get_decimation_info(decimation_mode); |
514 | 30.8k | promise(di.weight_count > 0); |
515 | | |
516 | 30.8k | trace_add_data("weight_x", di.weight_x); |
517 | 30.8k | trace_add_data("weight_y", di.weight_y); |
518 | 30.8k | trace_add_data("weight_z", di.weight_z); |
519 | 30.8k | trace_add_data("weight_quant", qw_bm.quant_mode); |
520 | | |
521 | | // Recompute the ideal color endpoints before storing them |
522 | 30.8k | vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS]; |
523 | 30.8k | vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS]; |
524 | | |
525 | 30.8k | symbolic_compressed_block workscb; |
526 | 30.8k | endpoints workep = ei.ep; |
527 | | |
528 | 30.8k | uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; |
529 | | |
530 | 694k | for (unsigned int j = 0; j < di.weight_count; j++) |
531 | 664k | { |
532 | 664k | workscb.weights[j] = u8_weight_src[j]; |
533 | 664k | } |
534 | | |
535 | 48.7k | for (unsigned int l = 0; l < config.tune_refinement_limit; l++) |
536 | 45.8k | { |
537 | 45.8k | recompute_ideal_colors_1plane( |
538 | 45.8k | blk, pi, di, workscb.weights, |
539 | 45.8k | workep, rgbs_colors, rgbo_colors); |
540 | | |
541 | | // Quantize the chosen color, tracking if worth trying the mod value |
542 | 45.8k | bool all_same = color_quant_level[i] != color_quant_level_mod[i]; |
543 | 145k | for (unsigned int j = 0; j < partition_count; j++) |
544 | 99.7k | { |
545 | 99.7k | workscb.color_formats[j] = pack_color_endpoints( |
546 | 99.7k | workep.endpt0[j], |
547 | 99.7k | workep.endpt1[j], |
548 | 99.7k | rgbs_colors[j], |
549 | 99.7k | rgbo_colors[j], |
550 | 99.7k | partition_format_specifiers[i][j], |
551 | 99.7k | workscb.color_values[j], |
552 | 99.7k | color_quant_level[i]); |
553 | | |
554 | 99.7k | all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0]; |
555 | 99.7k | } |
556 | | |
557 | | // If all the color endpoint modes are the same, we get a few more bits to store colors; |
558 | | // let's see if we can take advantage of this: requantize all the colors and see if the |
559 | | // endpoint modes remain the same. |
560 | 45.8k | workscb.color_formats_matched = 0; |
561 | 45.8k | if (partition_count >= 2 && all_same) |
562 | 8.87k | { |
563 | 8.87k | uint8_t colorvals[BLOCK_MAX_PARTITIONS][8]; |
564 | 8.87k | uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 }; |
565 | 8.87k | bool all_same_mod = true; |
566 | 33.3k | for (unsigned int j = 0; j < partition_count; j++) |
567 | 24.8k | { |
568 | 24.8k | color_formats_mod[j] = pack_color_endpoints( |
569 | 24.8k | workep.endpt0[j], |
570 | 24.8k | workep.endpt1[j], |
571 | 24.8k | rgbs_colors[j], |
572 | 24.8k | rgbo_colors[j], |
573 | 24.8k | partition_format_specifiers[i][j], |
574 | 24.8k | colorvals[j], |
575 | 24.8k | color_quant_level_mod[i]); |
576 | | |
577 | | // Early out as soon as it's no longer possible to use mod |
578 | 24.8k | if (color_formats_mod[j] != color_formats_mod[0]) |
579 | 416 | { |
580 | 416 | all_same_mod = false; |
581 | 416 | break; |
582 | 416 | } |
583 | 24.8k | } |
584 | | |
585 | 8.87k | if (all_same_mod) |
586 | 8.45k | { |
587 | 8.45k | workscb.color_formats_matched = 1; |
588 | 42.2k | for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++) |
589 | 33.8k | { |
590 | 304k | for (unsigned int k = 0; k < 8; k++) |
591 | 270k | { |
592 | 270k | workscb.color_values[j][k] = colorvals[j][k]; |
593 | 270k | } |
594 | | |
595 | 33.8k | workscb.color_formats[j] = color_formats_mod[j]; |
596 | 33.8k | } |
597 | 8.45k | } |
598 | 8.87k | } |
599 | | |
600 | | // Store header fields |
601 | 45.8k | workscb.partition_count = static_cast<uint8_t>(partition_count); |
602 | 45.8k | workscb.partition_index = static_cast<uint16_t>(partition_index); |
603 | 45.8k | workscb.plane2_component = -1; |
604 | 45.8k | workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i]; |
605 | 45.8k | workscb.block_mode = qw_bm.mode_index; |
606 | 45.8k | workscb.block_type = SYM_BTYPE_NONCONST; |
607 | | |
608 | | // Pre-realign test |
609 | 45.8k | if (l == 0) |
610 | 30.8k | { |
611 | 30.8k | float errorval = compute_difference(config, bsd, workscb, blk); |
612 | 30.8k | if (errorval == -ERROR_CALC_DEFAULT) |
613 | 5.92k | { |
614 | 5.92k | errorval = -errorval; |
615 | 5.92k | workscb.block_type = SYM_BTYPE_ERROR; |
616 | 5.92k | } |
617 | | |
618 | 30.8k | trace_add_data("error_prerealign", errorval); |
619 | 30.8k | best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); |
620 | | |
621 | | // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first |
622 | | // iteration can help more so we give it a extra 8% leeway. Use this knowledge to |
623 | | // drive a heuristic to skip blocks that are unlikely to catch up with the best |
624 | | // block we have already. |
625 | 30.8k | unsigned int iters_remaining = config.tune_refinement_limit - l; |
626 | 30.8k | float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f; |
627 | 30.8k | if (errorval > (threshold * best_errorval_in_scb)) |
628 | 15.9k | { |
629 | 15.9k | break; |
630 | 15.9k | } |
631 | | |
632 | 14.9k | if (errorval < best_errorval_in_scb) |
633 | 3.88k | { |
634 | 3.88k | best_errorval_in_scb = errorval; |
635 | 3.88k | workscb.errorval = errorval; |
636 | 3.88k | scb = workscb; |
637 | | |
638 | 3.88k | if (errorval < tune_errorval_threshold) |
639 | 29 | { |
640 | | // Skip remaining candidates - this is "good enough" |
641 | 29 | i = candidate_count; |
642 | 29 | break; |
643 | 29 | } |
644 | 3.88k | } |
645 | 14.9k | } |
646 | | |
647 | 29.8k | bool adjustments; |
648 | 29.8k | if (di.weight_count != bsd.texel_count) |
649 | 4.59k | { |
650 | 4.59k | adjustments = realign_weights_decimated( |
651 | 4.59k | config.profile, bsd, blk, workscb); |
652 | 4.59k | } |
653 | 25.2k | else |
654 | 25.2k | { |
655 | 25.2k | adjustments = realign_weights_undecimated( |
656 | 25.2k | config.profile, bsd, blk, workscb); |
657 | 25.2k | } |
658 | | |
659 | | // Post-realign test |
660 | 29.8k | float errorval = compute_difference(config, bsd, workscb, blk); |
661 | 29.8k | if (errorval == -ERROR_CALC_DEFAULT) |
662 | 5.44k | { |
663 | 5.44k | errorval = -errorval; |
664 | 5.44k | workscb.block_type = SYM_BTYPE_ERROR; |
665 | 5.44k | } |
666 | | |
667 | 29.8k | trace_add_data("error_postrealign", errorval); |
668 | 29.8k | best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); |
669 | | |
670 | | // Average refinement improvement is 3.5% per iteration, so skip blocks that are |
671 | | // unlikely to catch up with the best block we have already. Assume a 4.5% per step to |
672 | | // give benefit of the doubt ... |
673 | 29.8k | unsigned int iters_remaining = config.tune_refinement_limit - 1 - l; |
674 | 29.8k | float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f; |
675 | 29.8k | if (errorval > (threshold * best_errorval_in_scb)) |
676 | 4.35k | { |
677 | 4.35k | break; |
678 | 4.35k | } |
679 | | |
680 | 25.5k | if (errorval < best_errorval_in_scb) |
681 | 6.50k | { |
682 | 6.50k | best_errorval_in_scb = errorval; |
683 | 6.50k | workscb.errorval = errorval; |
684 | 6.50k | scb = workscb; |
685 | | |
686 | 6.50k | if (errorval < tune_errorval_threshold) |
687 | 7 | { |
688 | | // Skip remaining candidates - this is "good enough" |
689 | 7 | i = candidate_count; |
690 | 7 | break; |
691 | 7 | } |
692 | 6.50k | } |
693 | | |
694 | 25.5k | if (!adjustments) |
695 | 7.64k | { |
696 | 7.64k | break; |
697 | 7.64k | } |
698 | 25.5k | } |
699 | 30.8k | } |
700 | | |
701 | 10.9k | return best_errorval_in_mode; |
702 | 10.9k | } |
703 | | |
704 | | /** |
705 | | * @brief Compress a block using a chosen partitioning and 2 planes of weights. |
706 | | * |
 | | * Only the single-partition case is handled: endpoint formats are chosen using |
 | | * bsd.get_partition_info(1, 0) and every candidate is stored with a partition |
 | | * count of 1 and a partition index of 0. |
 | | * |
 | | * The incoming scb.errorval is used as the baseline to beat; scb is only |
 | | * overwritten when a candidate achieves a strictly lower error than that baseline. |
 | | * |
707 | | * @param config The compressor configuration. |
708 | | * @param bsd The block size information. |
709 | | * @param blk The image block color data to compress. |
710 | | * @param tune_errorval_threshold The error value threshold; a stored candidate below this ends the search. |
711 | | * @param plane2_component The component index for the second plane of weights. |
712 | | * @param[in,out] scb The symbolic compressed block; errorval is read as the current best on entry. |
713 | | * @param[out] tmpbuf The working buffers, used as scratch for ideal weights, quantized weights, and per-mode errors. |
 | | * @param quant_limit The highest weight quant level to consider; clamped to QUANT_32. Block |
 | | * modes with a higher quant_mode are given an error of 1e38f and skipped. |
 | | * |
 | | * @return The lowest error seen for any candidate evaluated by this call, or ERROR_CALC_DEFAULT if |
 | | * no candidate produced a lower value. This is tracked independently of scb.errorval. |
714 | | */ |
715 | | static float compress_symbolic_block_for_partition_2planes( |
716 | | const astcenc_config& config, |
717 | | const block_size_descriptor& bsd, |
718 | | const image_block& blk, |
719 | | float tune_errorval_threshold, |
720 | | unsigned int plane2_component, |
721 | | symbolic_compressed_block& scb, |
722 | | compression_working_buffers& tmpbuf, |
723 | | int quant_limit |
724 | 6.99k | ) { |
725 | 6.99k | promise(config.tune_candidate_limit > 0); |
726 | 6.99k | promise(config.tune_refinement_limit > 0); |
727 | 6.99k | promise(bsd.decimation_mode_count_selected > 0); |
728 | | |
729 | 6.99k | int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); |
730 | | |
731 | | // Compute ideal weights and endpoint colors, with no quantization or decimation |
732 | 6.99k | endpoints_and_weights& ei1 = tmpbuf.ei1; |
733 | 6.99k | endpoints_and_weights& ei2 = tmpbuf.ei2; |
734 | | |
735 | 6.99k | compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2); |
736 | | |
737 | | // Compute ideal weights and endpoint colors for every decimation |
738 | 6.99k | float* dec_weights_ideal = tmpbuf.dec_weights_ideal; |
739 | 6.99k | uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; |
740 | | |
741 | | // For each decimation mode, compute an ideal set of weights with no quantization |
742 | 72.7k | for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) |
743 | 65.7k | { |
744 | 65.7k | const auto& dm = bsd.get_decimation_mode(i); |
745 | 65.7k | if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant))) |
746 | 26.7k | { |
747 | 26.7k | continue; |
748 | 26.7k | } |
749 | | |
750 | 38.9k | const auto& di = bsd.get_decimation_info(i); |
751 | | |
752 | 38.9k | compute_ideal_weights_for_decimation( |
753 | 38.9k | ei1, |
754 | 38.9k | di, |
755 | 38.9k | dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); |
756 | | |
757 | 38.9k | compute_ideal_weights_for_decimation( |
758 | 38.9k | ei2, |
759 | 38.9k | di, |
760 | 38.9k | dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET); |
761 | 38.9k | } |
762 | | |
763 | | // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal |
764 | | // weight pair, compute the smallest weight that will result in a color value greater than 1 |
765 | 6.99k | vfloat4 min_ep1(10.0f); |
766 | 6.99k | vfloat4 min_ep2(10.0f); |
767 | | |
768 | 6.99k | vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]); |
769 | 6.99k | vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1); |
770 | 6.99k | min_ep1 = select(min_ep1, ep1, use_ep1); |
771 | | |
772 | 6.99k | vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]); |
773 | 6.99k | vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2); |
774 | 6.99k | min_ep2 = select(min_ep2, ep2, use_ep2); |
775 | | |
776 | 6.99k | vfloat4 err_max(ERROR_CALC_DEFAULT); |
777 | 6.99k | vmask4 err_mask = vint4::lane_id() == vint4(plane2_component); |
778 | | |
779 | | // Set the plane2 component to max error in ep1 so it cannot win the plane 1 minimum |
780 | 6.99k | min_ep1 = select(min_ep1, err_max, err_mask); |
781 | | |
782 | 6.99k | float min_wt_cutoff1 = hmin_s(min_ep1); |
783 | | |
784 | | // Set the minwt2 to the plane2 component min in ep2; all other lanes are set to max error |
785 | 6.99k | float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); |
786 | | |
787 | 6.99k | compute_angular_endpoints_2planes( |
788 | 6.99k | bsd, dec_weights_ideal, max_weight_quant, tmpbuf); |
789 | | |
790 | | // For each mode (which specifies a decimation and a quantization): |
791 | | // * Compute number of bits needed for the quantized weights |
792 | | // * Generate an optimized set of quantized weights |
793 | | // * Compute quantization errors for the mode |
794 | | |
795 | 6.99k | float* weight_low_value1 = tmpbuf.weight_low_value1; |
796 | 6.99k | float* weight_high_value1 = tmpbuf.weight_high_value1; |
797 | 6.99k | float* weight_low_value2 = tmpbuf.weight_low_value2; |
798 | 6.99k | float* weight_high_value2 = tmpbuf.weight_high_value2; |
799 | | |
800 | 6.99k | int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; |
801 | 6.99k | float* qwt_errors = tmpbuf.qwt_errors; |
802 | | |
803 | 6.99k | unsigned int start_2plane = bsd.block_mode_count_1plane_selected; |
804 | 6.99k | unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected; |
805 | | |
806 | 78.8k | for (unsigned int i = start_2plane; i < end_2plane; i++) |
807 | 71.8k | { |
808 | 71.8k | const block_mode& bm = bsd.block_modes[i]; |
809 | 71.8k | assert(bm.is_dual_plane); |
810 | | |
811 | 71.8k | if (bm.quant_mode > max_weight_quant) |
812 | 22.6k | { |
813 | 22.6k | qwt_errors[i] = 1e38f; |
814 | 22.6k | continue; |
815 | 22.6k | } |
816 | | |
817 | 49.1k | qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits); |
818 | | |
819 | 49.1k | if (weight_high_value1[i] > 1.02f * min_wt_cutoff1) |
820 | 6.42k | { |
821 | 6.42k | weight_high_value1[i] = 1.0f; |
822 | 6.42k | } |
823 | | |
824 | 49.1k | if (weight_high_value2[i] > 1.02f * min_wt_cutoff2) |
825 | 575 | { |
826 | 575 | weight_high_value2[i] = 1.0f; |
827 | 575 | } |
828 | | |
829 | 49.1k | unsigned int decimation_mode = bm.decimation_mode; |
830 | 49.1k | const auto& di = bsd.get_decimation_info(decimation_mode); |
831 | | |
832 | 49.1k | ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; |
833 | | |
834 | | // Generate the optimized set of weights for the mode, one call per weight plane |
835 | 49.1k | compute_quantized_weights_for_decimation( |
836 | 49.1k | di, |
837 | 49.1k | weight_low_value1[i], |
838 | 49.1k | weight_high_value1[i], |
839 | 49.1k | dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, |
840 | 49.1k | dec_weights_uquantf, |
841 | 49.1k | dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, |
842 | 49.1k | bm.get_weight_quant_mode()); |
843 | | |
844 | 49.1k | compute_quantized_weights_for_decimation( |
845 | 49.1k | di, |
846 | 49.1k | weight_low_value2[i], |
847 | 49.1k | weight_high_value2[i], |
848 | 49.1k | dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET, |
849 | 49.1k | dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET, |
850 | 49.1k | dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET, |
851 | 49.1k | bm.get_weight_quant_mode()); |
852 | | |
853 | | // Compute weight quantization errors for the block mode |
854 | 49.1k | qwt_errors[i] = compute_error_of_weight_set_2planes( |
855 | 49.1k | ei1, |
856 | 49.1k | ei2, |
857 | 49.1k | di, |
858 | 49.1k | dec_weights_uquantf, |
859 | 49.1k | dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET); |
860 | 49.1k | } |
861 | | |
862 | | // Decide the optimal combination of color endpoint encodings and weight encodings |
863 | 6.99k | uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; |
864 | 6.99k | int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; |
865 | | |
866 | 6.99k | quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; |
867 | 6.99k | quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; |
868 | | |
869 | 6.99k | endpoints epm; |
870 | 6.99k | merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm); |
871 | | |
872 | 6.99k | const auto& pi = bsd.get_partition_info(1, 0); |
873 | 6.99k | unsigned int candidate_count = compute_ideal_endpoint_formats( |
874 | 6.99k | pi, blk, epm, qwt_bitcounts, qwt_errors, |
875 | 6.99k | config.tune_candidate_limit, |
876 | 6.99k | bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected, |
877 | 6.99k | partition_format_specifiers, block_mode_index, |
878 | 6.99k | color_quant_level, color_quant_level_mod, tmpbuf); |
879 | | |
880 | | // Iterate over the N believed-to-be-best modes to find out which one is actually best |
881 | 6.99k | float best_errorval_in_mode = ERROR_CALC_DEFAULT; |
882 | 6.99k | float best_errorval_in_scb = scb.errorval; |
883 | | |
884 | 24.5k | for (unsigned int i = 0; i < candidate_count; i++) |
885 | 17.5k | { |
886 | 17.5k | TRACE_NODE(node0, "candidate"); |
887 | | |
888 | 17.5k | const int bm_packed_index = block_mode_index[i]; |
889 | 17.5k | assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) && |
890 | 17.5k | bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected)); |
891 | 17.5k | const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; |
892 | | |
893 | 17.5k | int decimation_mode = qw_bm.decimation_mode; |
894 | 17.5k | const auto& di = bsd.get_decimation_info(decimation_mode); |
895 | 17.5k | promise(di.weight_count > 0); |
896 | | |
897 | 17.5k | trace_add_data("weight_x", di.weight_x); |
898 | 17.5k | trace_add_data("weight_y", di.weight_y); |
899 | 17.5k | trace_add_data("weight_z", di.weight_z); |
900 | 17.5k | trace_add_data("weight_quant", qw_bm.quant_mode); |
901 | | |
902 | 17.5k | vfloat4 rgbs_color; |
903 | 17.5k | vfloat4 rgbo_color; |
904 | | |
905 | 17.5k | symbolic_compressed_block workscb; |
906 | 17.5k | endpoints workep = epm; |
907 | | |
908 | 17.5k | uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; |
909 | 17.5k | uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET; |
910 | | |
911 | 308k | for (int j = 0; j < di.weight_count; j++) |
912 | 290k | { |
913 | 290k | workscb.weights[j] = u8_weight1_src[j]; |
914 | 290k | workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j]; |
915 | 290k | } |
916 | | |
917 | 27.0k | for (unsigned int l = 0; l < config.tune_refinement_limit; l++) |
918 | 25.6k | { |
919 | 25.6k | recompute_ideal_colors_2planes( |
920 | 25.6k | blk, bsd, di, |
921 | 25.6k | workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET, |
922 | 25.6k | workep, rgbs_color, rgbo_color, plane2_component); |
923 | | |
924 | | // Quantize the chosen color |
925 | 25.6k | workscb.color_formats[0] = pack_color_endpoints( |
926 | 25.6k | workep.endpt0[0], |
927 | 25.6k | workep.endpt1[0], |
928 | 25.6k | rgbs_color, rgbo_color, |
929 | 25.6k | partition_format_specifiers[i][0], |
930 | 25.6k | workscb.color_values[0], |
931 | 25.6k | color_quant_level[i]); |
932 | | |
933 | | // Store header fields |
934 | 25.6k | workscb.partition_count = 1; |
935 | 25.6k | workscb.partition_index = 0; |
936 | 25.6k | workscb.quant_mode = color_quant_level[i]; |
937 | 25.6k | workscb.color_formats_matched = 0; |
938 | 25.6k | workscb.block_mode = qw_bm.mode_index; |
939 | 25.6k | workscb.plane2_component = static_cast<int8_t>(plane2_component); |
940 | 25.6k | workscb.block_type = SYM_BTYPE_NONCONST; |
941 | | |
942 | | // Pre-realign test, only run on the first refinement iteration |
943 | 25.6k | if (l == 0) |
944 | 17.5k | { |
945 | 17.5k | float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk); |
946 | 17.5k | if (errorval == -ERROR_CALC_DEFAULT) |
947 | 5.22k | { |
948 | 5.22k | errorval = -errorval; |
949 | 5.22k | workscb.block_type = SYM_BTYPE_ERROR; |
950 | 5.22k | } |
951 | | |
952 | 17.5k | trace_add_data("error_prerealign", errorval); |
953 | 17.5k | best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); |
954 | | |
955 | | // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first |
956 | | // iteration can help more so we give it an extra 8% leeway. Use this knowledge to |
957 | | // drive a heuristic to skip blocks that are unlikely to catch up with the best |
958 | | // block we have already. |
959 | 17.5k | unsigned int iters_remaining = config.tune_refinement_limit - l; |
960 | 17.5k | float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f; |
961 | 17.5k | if (errorval > (threshold * best_errorval_in_scb)) |
962 | 9.36k | { |
963 | 9.36k | break; |
964 | 9.36k | } |
965 | | |
966 | 8.22k | if (errorval < best_errorval_in_scb) |
967 | 1.73k | { |
968 | 1.73k | best_errorval_in_scb = errorval; |
969 | 1.73k | workscb.errorval = errorval; |
970 | 1.73k | scb = workscb; |
971 | | |
972 | 1.73k | if (errorval < tune_errorval_threshold) |
973 | 3 | { |
974 | | // Skip remaining candidates - this is "good enough" |
975 | 3 | i = candidate_count; |
976 | 3 | break; |
977 | 3 | } |
978 | 1.73k | } |
979 | 8.22k | } |
980 | | |
981 | | // Perform a final pass over the weights to try to improve them. |
982 | 16.3k | bool adjustments; |
983 | 16.3k | if (di.weight_count != bsd.texel_count) |
984 | 7.62k | { |
985 | 7.62k | adjustments = realign_weights_decimated( |
986 | 7.62k | config.profile, bsd, blk, workscb); |
987 | 7.62k | } |
988 | 8.68k | else |
989 | 8.68k | { |
990 | 8.68k | adjustments = realign_weights_undecimated( |
991 | 8.68k | config.profile, bsd, blk, workscb); |
992 | 8.68k | } |
993 | | |
994 | | // Post-realign test |
995 | 16.3k | float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk); |
996 | 16.3k | if (errorval == -ERROR_CALC_DEFAULT) |
997 | 5.38k | { |
998 | 5.38k | errorval = -errorval; |
999 | 5.38k | workscb.block_type = SYM_BTYPE_ERROR; |
1000 | 5.38k | } |
1001 | | |
1002 | 16.3k | trace_add_data("error_postrealign", errorval); |
1003 | 16.3k | best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode); |
1004 | | |
1005 | | // Average refinement improvement is 3.5% per iteration, so skip blocks that are |
1006 | | // unlikely to catch up with the best block we have already. Assume a 4.5% per step to |
1007 | | // give benefit of the doubt ... |
1008 | 16.3k | unsigned int iters_remaining = config.tune_refinement_limit - 1 - l; |
1009 | 16.3k | float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f; |
1010 | 16.3k | if (errorval > (threshold * best_errorval_in_scb)) |
1011 | 1.63k | { |
1012 | 1.63k | break; |
1013 | 1.63k | } |
1014 | | |
1015 | 14.6k | if (errorval < best_errorval_in_scb) |
1016 | 2.37k | { |
1017 | 2.37k | best_errorval_in_scb = errorval; |
1018 | 2.37k | workscb.errorval = errorval; |
1019 | 2.37k | scb = workscb; |
1020 | | |
1021 | 2.37k | if (errorval < tune_errorval_threshold) |
1022 | 5 | { |
1023 | | // Skip remaining candidates - this is "good enough" |
1024 | 5 | i = candidate_count; |
1025 | 5 | break; |
1026 | 5 | } |
1027 | 2.37k | } |
1028 | | |
1029 | 14.6k | if (!adjustments) |
1030 | 5.22k | { |
1031 | 5.22k | break; |
1032 | 5.22k | } |
1033 | 14.6k | } |
1034 | 17.5k | } |
1035 | | |
1036 | 6.99k | return best_errorval_in_mode; |
1037 | 6.99k | } |
1038 | | |
1039 | | /** |
1040 | | * @brief Determine the lowest cross-channel correlation factor. |
1041 | | * |
 | | * Accumulates weighted sums, variances, and covariances of the R, G, B, and A |
 | | * texel data, then normalizes each of the six channel-pair covariances by the |
 | | * reciprocal square root of the product of the two channel variances. Every |
 | | * texel uses the same weight: the mean of the four values in blk.channel_weight. |
 | | * |
 | | * A pair whose normalized value evaluates to NaN is replaced with 1.0 before |
 | | * the minimum is taken. |
 | | * |
1042 | | * @param texels_per_block The number of texels in a block; must be greater than zero. |
1043 | | * @param blk The image block color data to compress. |
1044 | | * |
1045 | | * @return Return the lowest absolute correlation factor across the RG, RB, RA, GB, GA, and BA pairs. |
1046 | | */ |
1047 | | static float prepare_block_statistics( |
1048 | | int texels_per_block, |
1049 | | const image_block& blk |
1050 | 2.17k | ) { |
1051 | | // Compute covariance matrix, as a collection of 10 scalars that form the upper triangle |
1052 | | // of the matrix. The matrix is symmetric, so this is all we need for this use case. |
1053 | 2.17k | float rs = 0.0f; |
1054 | 2.17k | float gs = 0.0f; |
1055 | 2.17k | float bs = 0.0f; |
1056 | 2.17k | float as = 0.0f; |
1057 | 2.17k | float rr_var = 0.0f; |
1058 | 2.17k | float gg_var = 0.0f; |
1059 | 2.17k | float bb_var = 0.0f; |
1060 | 2.17k | float aa_var = 0.0f; |
1061 | 2.17k | float rg_cov = 0.0f; |
1062 | 2.17k | float rb_cov = 0.0f; |
1063 | 2.17k | float ra_cov = 0.0f; |
1064 | 2.17k | float gb_cov = 0.0f; |
1065 | 2.17k | float ga_cov = 0.0f; |
1066 | 2.17k | float ba_cov = 0.0f; |
1067 | | |
1068 | 2.17k | float weight_sum = 0.0f; |
1069 | | |
1070 | 2.17k | promise(texels_per_block > 0); |
1071 | 58.0k | for (int i = 0; i < texels_per_block; i++) |
1072 | 55.8k | { |
1073 | 55.8k | float weight = hadd_s(blk.channel_weight) / 4.0f; |
1074 | 55.8k | assert(weight >= 0.0f); |
1075 | 55.8k | weight_sum += weight; |
1076 | | |
1077 | 55.8k | float r = blk.data_r[i]; |
1078 | 55.8k | float g = blk.data_g[i]; |
1079 | 55.8k | float b = blk.data_b[i]; |
1080 | 55.8k | float a = blk.data_a[i]; |
1081 | | |
1082 | 55.8k | float rw = r * weight; |
1083 | 55.8k | rs += rw; |
1084 | 55.8k | rr_var += r * rw; |
1085 | 55.8k | rg_cov += g * rw; |
1086 | 55.8k | rb_cov += b * rw; |
1087 | 55.8k | ra_cov += a * rw; |
1088 | | |
1089 | 55.8k | float gw = g * weight; |
1090 | 55.8k | gs += gw; |
1091 | 55.8k | gg_var += g * gw; |
1092 | 55.8k | gb_cov += b * gw; |
1093 | 55.8k | ga_cov += a * gw; |
1094 | | |
1095 | 55.8k | float bw = b * weight; |
1096 | 55.8k | bs += bw; |
1097 | 55.8k | bb_var += b * bw; |
1098 | 55.8k | ba_cov += a * bw; |
1099 | | |
1100 | 55.8k | float aw = a * weight; |
1101 | 55.8k | as += aw; |
1102 | 55.8k | aa_var += a * aw; |
1103 | 55.8k | } |
1104 | | |
1105 | 2.17k | float rpt = 1.0f / astc::max(weight_sum, 1e-7f); |
1106 | | |
1107 | 2.17k | rr_var -= rs * (rs * rpt); |
1108 | 2.17k | rg_cov -= gs * (rs * rpt); |
1109 | 2.17k | rb_cov -= bs * (rs * rpt); |
1110 | 2.17k | ra_cov -= as * (rs * rpt); |
1111 | | |
1112 | 2.17k | gg_var -= gs * (gs * rpt); |
1113 | 2.17k | gb_cov -= bs * (gs * rpt); |
1114 | 2.17k | ga_cov -= as * (gs * rpt); |
1115 | | |
1116 | 2.17k | bb_var -= bs * (bs * rpt); |
1117 | 2.17k | ba_cov -= as * (bs * rpt); |
1118 | | |
1119 | 2.17k | aa_var -= as * (as * rpt); |
1120 | | |
1121 | | // These will give a NaN if a channel is constant - these are fixed up in the next step |
1122 | 2.17k | rg_cov *= astc::rsqrt(rr_var * gg_var); |
1123 | 2.17k | rb_cov *= astc::rsqrt(rr_var * bb_var); |
1124 | 2.17k | ra_cov *= astc::rsqrt(rr_var * aa_var); |
1125 | 2.17k | gb_cov *= astc::rsqrt(gg_var * bb_var); |
1126 | 2.17k | ga_cov *= astc::rsqrt(gg_var * aa_var); |
1127 | 2.17k | ba_cov *= astc::rsqrt(bb_var * aa_var); |
1128 | | |
1129 | 2.17k | if (astc::isnan(rg_cov)) rg_cov = 1.0f; |
1130 | 2.17k | if (astc::isnan(rb_cov)) rb_cov = 1.0f; |
1131 | 2.17k | if (astc::isnan(ra_cov)) ra_cov = 1.0f; |
1132 | 2.17k | if (astc::isnan(gb_cov)) gb_cov = 1.0f; |
1133 | 2.17k | if (astc::isnan(ga_cov)) ga_cov = 1.0f; |
1134 | 2.17k | if (astc::isnan(ba_cov)) ba_cov = 1.0f; |
1135 | | |
1136 | 2.17k | float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); |
1137 | 2.17k | lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov)); |
1138 | 2.17k | lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov)); |
1139 | 2.17k | lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov)); |
1140 | 2.17k | lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov)); |
1141 | | |
1142 | | // Diagnostic trace points |
1143 | 2.17k | trace_add_data("min_r", blk.data_min.lane<0>()); |
1144 | 2.17k | trace_add_data("max_r", blk.data_max.lane<0>()); |
1145 | 2.17k | trace_add_data("min_g", blk.data_min.lane<1>()); |
1146 | 2.17k | trace_add_data("max_g", blk.data_max.lane<1>()); |
1147 | 2.17k | trace_add_data("min_b", blk.data_min.lane<2>()); |
1148 | 2.17k | trace_add_data("max_b", blk.data_max.lane<2>()); |
1149 | 2.17k | trace_add_data("min_a", blk.data_min.lane<3>()); |
1150 | 2.17k | trace_add_data("max_a", blk.data_max.lane<3>()); |
1151 | 2.17k | trace_add_data("cov_rg", fabsf(rg_cov)); |
1152 | 2.17k | trace_add_data("cov_rb", fabsf(rb_cov)); |
1153 | 2.17k | trace_add_data("cov_ra", fabsf(ra_cov)); |
1154 | 2.17k | trace_add_data("cov_gb", fabsf(gb_cov)); |
1155 | 2.17k | trace_add_data("cov_ga", fabsf(ga_cov)); |
1156 | 2.17k | trace_add_data("cov_ba", fabsf(ba_cov)); |
1157 | | |
1158 | 2.17k | return lowest_correlation; |
1159 | 2.17k | } |
1160 | | |
1161 | | /* See header for documentation. |
 | | * |
 | | * Search order implemented below. Stages 2 to 4 jump to END_OF_TESTS as soon as |
 | | * a trial's error falls below the target error threshold. |
 | | * |
 | | * 1. Constant color: if blk.data_min equals blk.data_max the block is stored as |
 | | * a constant color (FP16 for the two HDR profiles, UNORM16 otherwise) and the |
 | | * function returns immediately. |
 | | * 2. 1 partition, 1 plane: run once, or twice when the mode0 search is enabled |
 | | * and the block is 2D (bsd.zdim == 1). |
 | | * 3. 1 partition, 2 planes: one trial per component, iterating from component 3 |
 | | * down to 0, with skips for highly correlated blocks, grayscale blocks, and |
 | | * constant components. |
 | | * 4. 2 up to tune_partition_count_limit partitions, 1 plane: stops early if |
 | | * adding partitions does not improve enough over the previous count. |
 | | * |
 | | * If no valid encoding was found, the block falls back to a UNORM16 constant |
 | | * color taken from blk.origin_texel. The result is then packed into pcb. |
 | | */ |
1162 | | void compress_block( |
1163 | | const astcenc_contexti& ctx, |
1164 | | const image_block& blk, |
1165 | | uint8_t pcb[16], |
1166 | | compression_working_buffers& tmpbuf) |
1167 | 2.21k | { |
1168 | 2.21k | astcenc_profile decode_mode = ctx.config.profile; |
1169 | 2.21k | symbolic_compressed_block scb; |
1170 | 2.21k | const block_size_descriptor& bsd = *ctx.bsd; |
1171 | 2.21k | float lowest_correl; |
1172 | | |
1173 | 2.21k | TRACE_NODE(node0, "block"); |
1174 | 2.21k | trace_add_data("pos_x", blk.xpos); |
1175 | 2.21k | trace_add_data("pos_y", blk.ypos); |
1176 | 2.21k | trace_add_data("pos_z", blk.zpos); |
1177 | | |
1178 | | // Set stricter block targets for luminance data as we have more bits to play with |
1179 | 2.21k | bool block_is_l = blk.is_luminance(); |
1180 | 2.21k | float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f; |
1181 | | |
1182 | | // Set slightly stricter block targets for lumalpha data as we have more bits to play with |
1183 | 2.21k | bool block_is_la = blk.is_luminancealpha(); |
1184 | 2.21k | float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f; |
1185 | | |
1186 | 2.21k | bool block_skip_two_plane = false; |
1187 | 2.21k | int max_partitions = ctx.config.tune_partition_count_limit; |
1188 | | |
1189 | 2.21k | unsigned int requested_partition_indices[3] { |
1190 | 2.21k | ctx.config.tune_2partition_index_limit, |
1191 | 2.21k | ctx.config.tune_3partition_index_limit, |
1192 | 2.21k | ctx.config.tune_4partition_index_limit |
1193 | 2.21k | }; |
1194 | | |
1195 | 2.21k | unsigned int requested_partition_trials[3] { |
1196 | 2.21k | ctx.config.tune_2partitioning_candidate_limit, |
1197 | 2.21k | ctx.config.tune_3partitioning_candidate_limit, |
1198 | 2.21k | ctx.config.tune_4partitioning_candidate_limit |
1199 | 2.21k | }; |
1200 | | |
1201 | | #if defined(ASTCENC_DIAGNOSTICS) |
1202 | | // Do this early in diagnostic builds so we can dump uniform metrics |
1203 | | // for every block. Do it later in release builds to avoid redundant work! |
1204 | | float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; |
1205 | | float error_threshold = ctx.config.tune_db_limit |
1206 | | * error_weight_sum |
1207 | | * block_is_l_scale |
1208 | | * block_is_la_scale; |
1209 | | |
1210 | | lowest_correl = prepare_block_statistics(bsd.texel_count, blk); |
1211 | | trace_add_data("lowest_correl", lowest_correl); |
1212 | | trace_add_data("tune_error_threshold", error_threshold); |
1213 | | #endif |
1214 | | |
1215 | | // Detected a constant-color block |
1216 | 2.21k | if (all(blk.data_min == blk.data_max)) |
1217 | 11 | { |
1218 | 11 | TRACE_NODE(node1, "pass"); |
1219 | 11 | trace_add_data("partition_count", 0); |
1220 | 11 | trace_add_data("plane_count", 1); |
1221 | | |
1222 | 11 | scb.partition_count = 0; |
1223 | | |
1224 | | // Encode as FP16 if using HDR |
1225 | 11 | if ((decode_mode == ASTCENC_PRF_HDR) || |
1226 | 8 | (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A)) |
1227 | 5 | { |
1228 | 5 | scb.block_type = SYM_BTYPE_CONST_F16; |
1229 | 5 | vint4 color_f16 = float_to_float16(blk.origin_texel); |
1230 | 5 | store(color_f16, scb.constant_color); |
1231 | 5 | } |
1232 | | // Encode as UNORM16 if NOT using HDR |
1233 | 6 | else |
1234 | 6 | { |
1235 | 6 | scb.block_type = SYM_BTYPE_CONST_U16; |
1236 | 6 | vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; |
1237 | 6 | vint4 color_u16 = float_to_int_rtn(color_f32); |
1238 | 6 | store(color_u16, scb.constant_color); |
1239 | 6 | } |
1240 | | |
1241 | 11 | trace_add_data("exit", "quality hit"); |
1242 | | |
1243 | 11 | symbolic_to_physical(bsd, scb, pcb); |
1244 | 11 | return; |
1245 | 11 | } |
1246 | | |
1247 | 2.20k | #if !defined(ASTCENC_DIAGNOSTICS) |
1248 | 2.20k | float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; |
1249 | 2.20k | float error_threshold = ctx.config.tune_db_limit |
1250 | 2.20k | * error_weight_sum |
1251 | 2.20k | * block_is_l_scale |
1252 | 2.20k | * block_is_la_scale; |
1253 | 2.20k | #endif |
1254 | | |
1255 | | // Set SCB and mode errors to a very high error value |
1256 | 2.20k | scb.errorval = ERROR_CALC_DEFAULT; |
1257 | 2.20k | scb.block_type = SYM_BTYPE_ERROR; |
1258 | | |
1259 | 2.20k | float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] { |
1260 | 2.20k | ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT |
1261 | 2.20k | }; |
1262 | | |
1263 | 2.20k | float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] { |
1264 | 2.20k | 0.0f, |
1265 | 2.20k | ctx.config.tune_2partition_early_out_limit_factor, |
1266 | 2.20k | ctx.config.tune_3partition_early_out_limit_factor, |
1267 | 2.20k | 0.0f |
1268 | 2.20k | }; |
1269 | | |
1270 | | // Trial using 1 plane of weights and 1 partition. |
1271 | | |
1272 | | // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified |
1273 | | // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this |
1274 | | // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the |
1275 | | // compression and slightly reduces image quality. |
1276 | | |
1277 | 2.20k | float errorval_mult[2] { |
1278 | 2.20k | 1.0f / ctx.config.tune_mse_overshoot, |
1279 | 2.20k | 1.0f |
1280 | 2.20k | }; |
1281 | | |
1282 | 2.20k | const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot; |
1283 | | |
1284 | | // Only enable MODE0 fast path if enabled |
1285 | | // Never enable for 3D blocks as no "always" block modes are available |
1286 | 2.20k | int start_trial = 1; |
1287 | 2.20k | if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1)) |
1288 | 347 | { |
1289 | 347 | start_trial = 0; |
1290 | 347 | } |
1291 | | |
1292 | 2.20k | int quant_limit = QUANT_32; |
1293 | 4.71k | for (int i = start_trial; i < 2; i++) |
1294 | 2.54k | { |
1295 | 2.54k | TRACE_NODE(node1, "pass"); |
1296 | 2.54k | trace_add_data("partition_count", 1); |
1297 | 2.54k | trace_add_data("plane_count", 1); |
1298 | 2.54k | trace_add_data("search_mode", i); |
1299 | | |
1300 | 2.54k | float errorval = compress_symbolic_block_for_partition_1plane( |
1301 | 2.54k | ctx.config, bsd, blk, i == 0, |
1302 | 2.54k | error_threshold * errorval_mult[i] * errorval_overshoot, |
1303 | 2.54k | 1, 0, scb, tmpbuf, QUANT_32); |
1304 | | |
1305 | | // Record the weight quant level of the best block so far so we can use it to filter later searches |
1306 | 2.54k | if (scb.block_type != SYM_BTYPE_ERROR) |
1307 | 1.94k | { |
1308 | 1.94k | const auto& bm = bsd.get_block_mode(scb.block_mode); |
1309 | 1.94k | quant_limit = bm.get_weight_quant_mode(); |
1310 | 1.94k | } |
1311 | | |
1312 | 2.54k | best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval); |
1313 | 2.54k | if (errorval < (error_threshold * errorval_mult[i])) |
1314 | 33 | { |
1315 | 33 | trace_add_data("exit", "quality hit"); |
1316 | 33 | goto END_OF_TESTS; |
1317 | 33 | } |
1318 | 2.54k | } |
1319 | | |
1320 | 2.17k | #if !defined(ASTCENC_DIAGNOSTICS) |
1321 | 2.17k | lowest_correl = prepare_block_statistics(bsd.texel_count, blk); |
1322 | 2.17k | #endif |
1323 | | |
1324 | 2.17k | block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation; |
1325 | | |
1326 | | // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as |
1327 | | // alpha is the most likely to be non-correlated if it is present in the data. |
1328 | 9.38k | for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--) |
1329 | 7.64k | { |
1330 | 7.64k | TRACE_NODE(node1, "pass"); |
1331 | 7.64k | trace_add_data("partition_count", 1); |
1332 | 7.64k | trace_add_data("plane_count", 2); |
1333 | 7.64k | trace_add_data("plane_component", i); |
1334 | | |
1335 | 7.64k | if (block_skip_two_plane) |
1336 | 224 | { |
1337 | 224 | trace_add_data("skip", "tune_2plane_early_out_limit_correlation"); |
1338 | 224 | continue; |
1339 | 224 | } |
1340 | | |
1341 | 7.42k | if (blk.grayscale && i != 3) |
1342 | 9 | { |
1343 | 9 | trace_add_data("skip", "grayscale block"); |
1344 | 9 | continue; |
1345 | 9 | } |
1346 | | |
1347 | 7.41k | if (blk.is_constant_channel(i)) |
1348 | 419 | { |
1349 | 419 | trace_add_data("skip", "constant component"); |
1350 | 419 | continue; |
1351 | 419 | } |
1352 | | |
1353 | 6.99k | float errorval = compress_symbolic_block_for_partition_2planes( |
1354 | 6.99k | ctx.config, bsd, blk, error_threshold * errorval_overshoot, |
1355 | 6.99k | i, scb, tmpbuf, quant_limit); |
1356 | | |
1357 | | // If attempting two planes is much worse than the best one plane result |
1358 | | // then further two plane searches are unlikely to help so move on ... |
1359 | 6.99k | if (errorval > (best_errorvals_for_pcount[0] * 1.85f)) |
1360 | 421 | { |
1361 | 421 | break; |
1362 | 421 | } |
1363 | | |
1364 | 6.57k | if (errorval < error_threshold) |
1365 | 13 | { |
1366 | 13 | trace_add_data("exit", "quality hit"); |
1367 | 13 | goto END_OF_TESTS; |
1368 | 13 | } |
1369 | 6.57k | } |
1370 | | |
1371 | | // Find best blocks for 2, 3 and 4 partitions |
1372 | 5.34k | for (int partition_count = 2; partition_count <= max_partitions; partition_count++) |
1373 | 4.94k | { |
1374 | 4.94k | unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES]; |
1375 | | |
1376 | 4.94k | unsigned int requested_indices = requested_partition_indices[partition_count - 2]; |
1377 | | |
1378 | 4.94k | unsigned int requested_trials = requested_partition_trials[partition_count - 2]; |
1379 | 4.94k | requested_trials = astc::min(requested_trials, requested_indices); |
1380 | | |
1381 | 4.94k | unsigned int actual_trials = find_best_partition_candidates( |
1382 | 4.94k | bsd, blk, partition_count, requested_indices, partition_indices, requested_trials); |
1383 | | |
1384 | 4.94k | float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; |
1385 | | |
1386 | 11.8k | for (unsigned int i = 0; i < actual_trials; i++) |
1387 | 8.37k | { |
1388 | 8.37k | TRACE_NODE(node1, "pass"); |
1389 | 8.37k | trace_add_data("partition_count", partition_count); |
1390 | 8.37k | trace_add_data("partition_index", partition_indices[i]); |
1391 | 8.37k | trace_add_data("plane_count", 1); |
1392 | 8.37k | trace_add_data("search_mode", i); |
1393 | | |
1394 | 8.37k | float errorval = compress_symbolic_block_for_partition_1plane( |
1395 | 8.37k | ctx.config, bsd, blk, false, |
1396 | 8.37k | error_threshold * errorval_overshoot, |
1397 | 8.37k | partition_count, partition_indices[i], |
1398 | 8.37k | scb, tmpbuf, quant_limit); |
1399 | | |
1400 | 8.37k | best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); |
1401 | | |
1402 | | // If using N partitions doesn't improve much over using N-1 partitions then skip trying |
1403 | | // N+1. Error can dramatically improve if the data is correlated or non-correlated and |
1404 | | // aligns with a partitioning that suits that encoding, so for this inner loop check add |
1405 | | // a large error scale because the "other" trial could be a lot better. |
1406 | 8.37k | float best_error = best_errorvals_for_pcount[partition_count - 1]; |
1407 | 8.37k | float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f; |
1408 | 8.37k | if (best_error > (best_error_in_prev * best_error_scale)) |
1409 | 1.48k | { |
1410 | 1.48k | trace_add_data("skip", "tune_partition_early_out_limit_factor"); |
1411 | 1.48k | goto END_OF_TESTS; |
1412 | 1.48k | } |
1413 | | |
1414 | 6.89k | if (errorval < error_threshold) |
1415 | 23 | { |
1416 | 23 | trace_add_data("exit", "quality hit"); |
1417 | 23 | goto END_OF_TESTS; |
1418 | 23 | } |
1419 | 6.89k | } |
1420 | | |
1421 | | // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1 |
1422 | 3.43k | float best_error = best_errorvals_for_pcount[partition_count - 1]; |
1423 | 3.43k | float best_error_scale = exit_thresholds_for_pcount[partition_count - 1]; |
1424 | 3.43k | if (best_error > (best_error_in_prev * best_error_scale)) |
1425 | 244 | { |
1426 | 244 | trace_add_data("skip", "tune_partition_early_out_limit_factor"); |
1427 | 244 | goto END_OF_TESTS; |
1428 | 244 | } |
1429 | 3.43k | } |
1430 | | |
1431 | 406 | trace_add_data("exit", "quality not hit"); |
1432 | | |
1433 | 2.20k | END_OF_TESTS: |
1434 | | // If we still have an error block then convert to something we can encode |
1435 | | // TODO: Do something more sensible here, such as average color block |
1436 | 2.20k | if (scb.block_type == SYM_BTYPE_ERROR) |
1437 | 120 | { |
1438 | | #if defined(ASTCENC_DIAGNOSTICS) |
1439 | | static bool printed_once = false; |
1440 | | if (!printed_once) |
1441 | | { |
1442 | | printed_once = true; |
1443 | | printf("WARN: At least one block failed to find a valid encoding.\n" |
1444 | | " Try increasing compression quality settings.\n\n"); |
1445 | | } |
1446 | | #endif |
1447 | | |
1448 | 120 | scb.block_type = SYM_BTYPE_CONST_U16; |
1449 | 120 | vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; |
1450 | 120 | vint4 color_u16 = float_to_int_rtn(color_f32); |
1451 | 120 | store(color_u16, scb.constant_color); |
1452 | 120 | } |
1453 | | |
1454 | | // Compress to a physical block |
1455 | 2.20k | symbolic_to_physical(bsd, scb, pcb); |
1456 | 2.20k | } |
1457 | | |
1458 | | #endif |