/src/astc-encoder/Source/astcenc_block_sizes.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2025 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | /** |
19 | | * @brief Functions to generate block size descriptor and decimation tables. |
20 | | */ |
21 | | |
22 | | #include "astcenc_internal.h" |
23 | | |
24 | | /** |
25 | | * @brief Decode the properties of an encoded 2D block mode. |
26 | | * @details Bits [1:0] pick the layout family: if non-zero the grid shape is selected by bits [3:2], otherwise by bits [8:7]. The (A + 6) x (B + 6) shape reuses bits [10:9] as B, so D and H are forced to zero for it. |
27 | | * @param block_mode The encoded block mode. |
28 | | * @param[out] x_weights The number of weights in the X dimension. |
29 | | * @param[out] y_weights The number of weights in the Y dimension. |
30 | | * @param[out] is_dual_plane True if this block mode has two weight planes. |
31 | | * @param[out] quant_mode The quantization level for the weights. |
32 | | * @param[out] weight_bits The storage bit count for the weights. |
33 | | * @note Reserved encodings return false early, with both weight dimensions left at zero and @c is_dual_plane, @c quant_mode, and @c weight_bits unwritten. |
34 | | * @return Returns true if the mode is not reserved, uses at most @c BLOCK_MAX_WEIGHTS weights, and its weight bit count lies in [@c BLOCK_MIN_WEIGHT_BITS, @c BLOCK_MAX_WEIGHT_BITS]; false otherwise. |
35 | | */ |
36 | | static bool decode_block_mode_2d( |
37 | | unsigned int block_mode, |
38 | | unsigned int& x_weights, |
39 | | unsigned int& y_weights, |
40 | | bool& is_dual_plane, |
41 | | unsigned int& quant_mode, |
42 | | unsigned int& weight_bits |
43 | 14.8k | ) { |
44 | 14.8k | unsigned int base_quant_mode = (block_mode >> 4) & 1; |
45 | 14.8k | unsigned int H = (block_mode >> 9) & 1; |
46 | 14.8k | unsigned int D = (block_mode >> 10) & 1; |
47 | 14.8k | unsigned int A = (block_mode >> 5) & 0x3; |
48 | | |
49 | 14.8k | x_weights = 0; |
50 | 14.8k | y_weights = 0; |
51 | | |
52 | 14.8k | if ((block_mode & 3) != 0) |
53 | 10.9k | { |
54 | 10.9k | base_quant_mode |= (block_mode & 3) << 1; |
55 | 10.9k | unsigned int B = (block_mode >> 7) & 3; |
56 | 10.9k | switch ((block_mode >> 2) & 3) |
57 | 10.9k | { |
58 | 2.62k | case 0: |
59 | 2.62k | x_weights = B + 4; |
60 | 2.62k | y_weights = A + 2; |
61 | 2.62k | break; |
62 | 2.85k | case 1: |
63 | 2.85k | x_weights = B + 8; |
64 | 2.85k | y_weights = A + 2; |
65 | 2.85k | break; |
66 | 2.85k | case 2: |
67 | 2.85k | x_weights = A + 2; |
68 | 2.85k | y_weights = B + 8; |
69 | 2.85k | break; |
70 | 2.60k | case 3: |
71 | 2.60k | B &= 1; |
72 | 2.60k | if (block_mode & 0x100) |
73 | 1.23k | { |
74 | 1.23k | x_weights = B + 2; |
75 | 1.23k | y_weights = A + 2; |
76 | 1.23k | } |
77 | 1.37k | else |
78 | 1.37k | { |
79 | 1.37k | x_weights = A + 2; |
80 | 1.37k | y_weights = B + 6; |
81 | 1.37k | } |
82 | 2.60k | break; |
83 | 10.9k | } |
84 | 10.9k | } |
85 | 3.94k | else |
86 | 3.94k | { |
87 | 3.94k | base_quant_mode |= ((block_mode >> 2) & 3) << 1; |
88 | 3.94k | if (((block_mode >> 2) & 3) == 0) |
89 | 1.02k | { |
90 | 1.02k | return false; |
91 | 1.02k | } |
92 | | |
93 | 2.92k | unsigned int B = (block_mode >> 9) & 3; |
94 | 2.92k | switch ((block_mode >> 7) & 3) |
95 | 2.92k | { |
96 | 727 | case 0: |
97 | 727 | x_weights = 12; |
98 | 727 | y_weights = A + 2; |
99 | 727 | break; |
100 | 727 | case 1: |
101 | 727 | x_weights = A + 2; |
102 | 727 | y_weights = 12; |
103 | 727 | break; |
104 | 707 | case 2: |
105 | 707 | x_weights = A + 6; |
106 | 707 | y_weights = B + 6; |
107 | 707 | D = 0; |
108 | 707 | H = 0; |
109 | 707 | break; |
110 | 760 | case 3: |
111 | 760 | switch ((block_mode >> 5) & 3) |
112 | 760 | { |
113 | 188 | case 0: |
114 | 188 | x_weights = 6; |
115 | 188 | y_weights = 10; |
116 | 188 | break; |
117 | 188 | case 1: |
118 | 188 | x_weights = 10; |
119 | 188 | y_weights = 6; |
120 | 188 | break; |
121 | 192 | case 2: |
122 | 384 | case 3: |
123 | 384 | return false; |
124 | 760 | } |
125 | 376 | break; |
126 | 2.92k | } |
127 | 2.92k | } |
128 | | |
129 | 13.4k | unsigned int weight_count = x_weights * y_weights * (D + 1); |
130 | 13.4k | quant_mode = (base_quant_mode - 2) + 6 * H; |
131 | 13.4k | is_dual_plane = D != 0; |
132 | | |
133 | 13.4k | weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); |
134 | 13.4k | return (weight_count <= BLOCK_MAX_WEIGHTS && |
135 | 10.8k | weight_bits >= BLOCK_MIN_WEIGHT_BITS && |
136 | 10.0k | weight_bits <= BLOCK_MAX_WEIGHT_BITS); |
137 | 14.8k | } |
138 | | |
139 | | /** |
140 | | * @brief Decode the properties of an encoded 3D block mode. |
141 | | * @details If bits [1:0] are non-zero the grid is (A + 2) x (B + 2) x (C + 2), with A, B, and C read from bits [6:5], [8:7], and [3:2]. Otherwise one axis gets 6 weights, chosen by bits [8:7] (or by bits [6:5] when bits [8:7] equal 3, with the other two axes fixed at 2); B is read from bits [10:9], and D and H are forced to zero unless bits [8:7] equal 3. |
142 | | * @param block_mode The encoded block mode. |
143 | | * @param[out] x_weights The number of weights in the X dimension. |
144 | | * @param[out] y_weights The number of weights in the Y dimension. |
145 | | * @param[out] z_weights The number of weights in the Z dimension. |
146 | | * @param[out] is_dual_plane True if this block mode has two weight planes. |
147 | | * @param[out] quant_mode The quantization level for the weights. |
148 | | * @param[out] weight_bits The storage bit count for the weights. |
149 | | * @note Reserved encodings return false early without writing @c is_dual_plane, @c quant_mode, or @c weight_bits. |
150 | | * @return Returns true if the mode is not reserved, uses at most @c BLOCK_MAX_WEIGHTS weights, and its weight bit count lies in [@c BLOCK_MIN_WEIGHT_BITS, @c BLOCK_MAX_WEIGHT_BITS]; false otherwise. |
151 | | */ |
152 | | static bool decode_block_mode_3d( |
153 | | unsigned int block_mode, |
154 | | unsigned int& x_weights, |
155 | | unsigned int& y_weights, |
156 | | unsigned int& z_weights, |
157 | | bool& is_dual_plane, |
158 | | unsigned int& quant_mode, |
159 | | unsigned int& weight_bits |
160 | 3.66k | ) { |
161 | 3.66k | unsigned int base_quant_mode = (block_mode >> 4) & 1; |
162 | 3.66k | unsigned int H = (block_mode >> 9) & 1; |
163 | 3.66k | unsigned int D = (block_mode >> 10) & 1; |
164 | 3.66k | unsigned int A = (block_mode >> 5) & 0x3; |
165 | | |
166 | 3.66k | x_weights = 0; |
167 | 3.66k | y_weights = 0; |
168 | 3.66k | z_weights = 0; |
169 | | |
170 | 3.66k | if ((block_mode & 3) != 0) |
171 | 2.74k | { |
172 | 2.74k | base_quant_mode |= (block_mode & 3) << 1; |
173 | 2.74k | unsigned int B = (block_mode >> 7) & 3; |
174 | 2.74k | unsigned int C = (block_mode >> 2) & 0x3; |
175 | 2.74k | x_weights = A + 2; |
176 | 2.74k | y_weights = B + 2; |
177 | 2.74k | z_weights = C + 2; |
178 | 2.74k | } |
179 | 913 | else |
180 | 913 | { |
181 | 913 | base_quant_mode |= ((block_mode >> 2) & 3) << 1; |
182 | 913 | if (((block_mode >> 2) & 3) == 0) |
183 | 256 | { |
184 | 256 | return false; |
185 | 256 | } |
186 | | |
187 | 657 | int B = (block_mode >> 9) & 3; |
188 | 657 | if (((block_mode >> 7) & 3) != 3) |
189 | 492 | { |
190 | 492 | D = 0; |
191 | 492 | H = 0; |
192 | 492 | } |
193 | 657 | switch ((block_mode >> 7) & 3) |
194 | 657 | { |
195 | 164 | case 0: |
196 | 164 | x_weights = 6; |
197 | 164 | y_weights = B + 2; |
198 | 164 | z_weights = A + 2; |
199 | 164 | break; |
200 | 164 | case 1: |
201 | 164 | x_weights = A + 2; |
202 | 164 | y_weights = 6; |
203 | 164 | z_weights = B + 2; |
204 | 164 | break; |
205 | 164 | case 2: |
206 | 164 | x_weights = A + 2; |
207 | 164 | y_weights = B + 2; |
208 | 164 | z_weights = 6; |
209 | 164 | break; |
210 | 165 | case 3: |
211 | 165 | x_weights = 2; |
212 | 165 | y_weights = 2; |
213 | 165 | z_weights = 2; |
214 | 165 | switch ((block_mode >> 5) & 3) |
215 | 165 | { |
216 | 39 | case 0: |
217 | 39 | x_weights = 6; |
218 | 39 | break; |
219 | 39 | case 1: |
220 | 39 | y_weights = 6; |
221 | 39 | break; |
222 | 39 | case 2: |
223 | 39 | z_weights = 6; |
224 | 39 | break; |
225 | 48 | case 3: |
226 | 48 | return false; |
227 | 165 | } |
228 | 117 | break; |
229 | 657 | } |
230 | 657 | } |
231 | | |
232 | 3.35k | unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1); |
233 | 3.35k | quant_mode = (base_quant_mode - 2) + 6 * H; |
234 | 3.35k | is_dual_plane = D != 0; |
235 | | |
236 | 3.35k | weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); |
237 | 3.35k | return (weight_count <= BLOCK_MAX_WEIGHTS && |
238 | 1.98k | weight_bits >= BLOCK_MIN_WEIGHT_BITS && |
239 | 1.94k | weight_bits <= BLOCK_MAX_WEIGHT_BITS); |
240 | 3.66k | } |
241 | | |
242 | | /** |
243 | | * @brief Create a 2D decimation entry for a block-size and weight-decimation pair. |
244 | | * @details For every texel this computes its position in the weight grid as a fixed-point value with 4 fractional bits, then records the surrounding grid weights (at most four) whose truncated bilinear contribution is non-zero. The per-texel and per-weight lists are first built in @c wb, then copied into the transposed arrays of @c di, and finally the array tails are padded so that SIMD code can over-fetch. |
245 | | * @param x_texels The number of texels in the X dimension. |
246 | | * @param y_texels The number of texels in the Y dimension. |
247 | | * @param x_weights The number of weights in the X dimension. |
248 | | * @param y_weights The number of weights in the Y dimension. |
249 | | * @param[out] di The decimation info structure to populate. |
250 | | * @param[out] wb The decimation table init scratch working buffers. |
251 | | */ |
252 | | static void init_decimation_info_2d( |
253 | | unsigned int x_texels, |
254 | | unsigned int y_texels, |
255 | | unsigned int x_weights, |
256 | | unsigned int y_weights, |
257 | | decimation_info& di, |
258 | | dt_init_working_buffers& wb |
259 | 96 | ) { |
260 | 96 | unsigned int texels_per_block = x_texels * y_texels; |
261 | 96 | unsigned int weights_per_block = x_weights * y_weights; |
262 | | |
263 | 96 | uint8_t max_texel_count_of_weight = 0; |
264 | | |
265 | 96 | promise(weights_per_block > 0); |
266 | 96 | promise(texels_per_block > 0); |
267 | 96 | promise(x_texels > 0); |
268 | 96 | promise(y_texels > 0); |
269 | | |
270 | 2.93k | for (unsigned int i = 0; i < weights_per_block; i++) |
271 | 2.83k | { |
272 | 2.83k | wb.texel_count_of_weight[i] = 0; |
273 | 2.83k | } |
274 | | |
275 | 12.7k | for (unsigned int i = 0; i < texels_per_block; i++) |
276 | 12.6k | { |
277 | 12.6k | wb.weight_count_of_texel[i] = 0; |
278 | 12.6k | } |
279 | | |
280 | 1.17k | for (unsigned int y = 0; y < y_texels; y++) |
281 | 1.08k | { |
282 | 13.7k | for (unsigned int x = 0; x < x_texels; x++) |
283 | 12.6k | { |
284 | 12.6k | unsigned int texel = y * x_texels + x; |
285 | | |
286 | 12.6k | unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; |
287 | 12.6k | unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; |
288 | | |
289 | 12.6k | unsigned int x_weight_frac = x_weight & 0xF; |
290 | 12.6k | unsigned int y_weight_frac = y_weight & 0xF; |
291 | 12.6k | unsigned int x_weight_int = x_weight >> 4; |
292 | 12.6k | unsigned int y_weight_int = y_weight >> 4; |
293 | | |
294 | 12.6k | unsigned int qweight[4]; |
295 | 12.6k | qweight[0] = x_weight_int + y_weight_int * x_weights; |
296 | 12.6k | qweight[1] = qweight[0] + 1; |
297 | 12.6k | qweight[2] = qweight[0] + x_weights; |
298 | 12.6k | qweight[3] = qweight[2] + 1; |
299 | | |
300 | | // Truncated-precision bilinear interpolation; the four contributions below always sum to 16 |
301 | 12.6k | unsigned int prod = x_weight_frac * y_weight_frac; |
302 | | |
303 | 12.6k | unsigned int weight[4]; |
304 | 12.6k | weight[3] = (prod + 8) >> 4; |
305 | 12.6k | weight[1] = x_weight_frac - weight[3]; |
306 | 12.6k | weight[2] = y_weight_frac - weight[3]; |
307 | 12.6k | weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3]; |
308 | | |
309 | 63.3k | for (unsigned int i = 0; i < 4; i++) |
310 | 50.6k | { |
311 | 50.6k | if (weight[i] != 0) |
312 | 37.8k | { |
313 | 37.8k | wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); |
314 | 37.8k | wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); |
315 | 37.8k | wb.weight_count_of_texel[texel]++; |
316 | 37.8k | wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); |
317 | 37.8k | wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); |
318 | 37.8k | wb.texel_count_of_weight[qweight[i]]++; |
319 | 37.8k | max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); |
320 | 37.8k | } |
321 | 50.6k | } |
322 | 12.6k | } |
323 | 1.08k | } |
324 | | |
325 | 96 | uint8_t max_texel_weight_count = 0; |
326 | 12.7k | for (unsigned int i = 0; i < texels_per_block; i++) |
327 | 12.6k | { |
328 | 12.6k | di.texel_weight_count[i] = wb.weight_count_of_texel[i]; |
329 | 12.6k | max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); |
330 | | |
331 | 50.5k | for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) |
332 | 37.8k | { |
333 | 37.8k | di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j]; |
334 | 37.8k | di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); |
335 | 37.8k | di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j]; |
336 | 37.8k | } |
337 | | |
338 | | // Zero the unused entries so all 4 are initialized and we can rely on zeros for vectorization |
339 | 25.4k | for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++) |
340 | 12.7k | { |
341 | 12.7k | di.texel_weight_contribs_int_tr[j][i] = 0; |
342 | 12.7k | di.texel_weight_contribs_float_tr[j][i] = 0.0f; |
343 | 12.7k | di.texel_weights_tr[j][i] = 0; |
344 | 12.7k | } |
345 | 12.6k | } |
346 | | |
347 | 96 | di.max_texel_weight_count = max_texel_weight_count; |
348 | | |
349 | 2.93k | for (unsigned int i = 0; i < weights_per_block; i++) |
350 | 2.83k | { |
351 | 2.83k | unsigned int texel_count_wt = wb.texel_count_of_weight[i]; |
352 | 2.83k | di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); |
353 | | |
354 | 40.7k | for (unsigned int j = 0; j < texel_count_wt; j++) |
355 | 37.8k | { |
356 | 37.8k | uint8_t texel = wb.texels_of_weight[i][j]; |
357 | | |
358 | | // Create transposed versions of these for better vectorization |
359 | 37.8k | di.weight_texels_tr[j][i] = texel; |
360 | 37.8k | di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); |
361 | | |
362 | | // Store the per-texel contribution of this weight: find the slot of this texel that references weight i |
363 | 37.8k | di.texel_contrib_for_weight[j][i] = 0.0f; |
364 | 81.4k | for (unsigned int k = 0; k < 4; k++) |
365 | 81.4k | { |
366 | 81.4k | uint8_t dttw = di.texel_weights_tr[k][texel]; |
367 | 81.4k | float dttwf = di.texel_weight_contribs_float_tr[k][texel]; |
368 | 81.4k | if (dttw == i && dttwf != 0.0f) |
369 | 37.8k | { |
370 | 37.8k | di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel]; |
371 | 37.8k | break; |
372 | 37.8k | } |
373 | 81.4k | } |
374 | 37.8k | } |
375 | | |
376 | | // Pad this weight's texel list up to the longest list so we can over-fetch with SIMD later to avoid loop tails |
377 | | // Match last texel in active lane in SIMD group, for better gathers |
378 | 2.83k | uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i]; |
379 | 20.2k | for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) |
380 | 17.4k | { |
381 | 17.4k | di.weight_texels_tr[j][i] = last_texel; |
382 | 17.4k | di.weights_texel_contribs_tr[j][i] = 0.0f; |
383 | 17.4k | } |
384 | 2.83k | } |
385 | | |
386 | | // Initialize the per-texel array tail so we can over-fetch with SIMD later to avoid loop tails |
387 | 96 | size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); |
388 | 96 | for (size_t i = texels_per_block; i < texels_per_block_simd; i++) |
389 | 0 | { |
390 | 0 | di.texel_weight_count[i] = 0; |
391 | 

392 | 0 | for (size_t j = 0; j < 4; j++) |
393 | 0 | { |
394 | 0 | di.texel_weight_contribs_float_tr[j][i] = 0; |
395 | 0 | di.texel_weights_tr[j][i] = 0; |
396 | 0 | di.texel_weight_contribs_int_tr[j][i] = 0; |
397 | 0 | } |
398 | 0 | } |
399 | | |
400 | | // Initialize the per-weight array tail so we can over-fetch with SIMD later to avoid loop tails |
401 | | // Match last texel in active lane in SIMD group, for better gathers |
402 | 96 | unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; |
403 | 96 | uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1]; |
404 | | |
405 | 96 | size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); |
406 | 184 | for (size_t i = weights_per_block; i < weights_per_block_simd; i++) |
407 | 88 | { |
408 | 88 | di.weight_texel_count[i] = 0; |
409 | | |
410 | 2.79k | for (size_t j = 0; j < max_texel_count_of_weight; j++) |
411 | 2.70k | { |
412 | 2.70k | di.weight_texels_tr[j][i] = last_texel; |
413 | 2.70k | di.weights_texel_contribs_tr[j][i] = 0.0f; |
414 | 2.70k | } |
415 | 88 | } |
416 | | |
417 | 96 | di.texel_count = static_cast<uint8_t>(texels_per_block); |
418 | 96 | di.weight_count = static_cast<uint8_t>(weights_per_block); |
419 | 96 | di.weight_x = static_cast<uint8_t>(x_weights); |
420 | 96 | di.weight_y = static_cast<uint8_t>(y_weights); |
421 | 96 | di.weight_z = 1; |
422 | 96 | } |
423 | | |
424 | | /** |
425 | | * @brief Create a 3D decimation entry for a block-size and weight-decimation pair. |
426 | | * @details For every texel this computes its position in the weight grid as a fixed-point value with 4 fractional bits per axis, then uses simplex interpolation: the texel references the base grid corner, the diagonally opposite corner, and two intermediate vertices reached with strides of 1, @c x_weights, or @c x_weights * @c y_weights, selected by the ordering of the three fractions. The four contributions sum to 16 and zero contributions are not stored. Orderings 1 and 6 are contradictory, so the @c default switch arm (a copy of case 0) is not expected to execute. |
427 | | * @param x_texels The number of texels in the X dimension. |
428 | | * @param y_texels The number of texels in the Y dimension. |
429 | | * @param z_texels The number of texels in the Z dimension. |
430 | | * @param x_weights The number of weights in the X dimension. |
431 | | * @param y_weights The number of weights in the Y dimension. |
432 | | * @param z_weights The number of weights in the Z dimension. |
433 | | * @param[out] di The decimation info structure to populate. |
434 | | * @param[out] wb The decimation table init scratch working buffers. |
435 | | */ |
436 | | static void init_decimation_info_3d( |
437 | | unsigned int x_texels, |
438 | | unsigned int y_texels, |
439 | | unsigned int z_texels, |
440 | | unsigned int x_weights, |
441 | | unsigned int y_weights, |
442 | | unsigned int z_weights, |
443 | | decimation_info& di, |
444 | | dt_init_working_buffers& wb |
445 | 78 | ) { |
446 | 78 | unsigned int texels_per_block = x_texels * y_texels * z_texels; |
447 | 78 | unsigned int weights_per_block = x_weights * y_weights * z_weights; |
448 | | |
449 | 78 | uint8_t max_texel_count_of_weight = 0; |
450 | | |
451 | 78 | promise(weights_per_block > 0); |
452 | 78 | promise(texels_per_block > 0); |
453 | | |
454 | 3.03k | for (unsigned int i = 0; i < weights_per_block; i++) |
455 | 2.95k | { |
456 | 2.95k | wb.texel_count_of_weight[i] = 0; |
457 | 2.95k | } |
458 | | |
459 | 16.9k | for (unsigned int i = 0; i < texels_per_block; i++) |
460 | 16.8k | { |
461 | 16.8k | wb.weight_count_of_texel[i] = 0; |
462 | 16.8k | } |
463 | | |
464 | 546 | for (unsigned int z = 0; z < z_texels; z++) |
465 | 468 | { |
466 | 3.27k | for (unsigned int y = 0; y < y_texels; y++) |
467 | 2.80k | { |
468 | 19.6k | for (unsigned int x = 0; x < x_texels; x++) |
469 | 16.8k | { |
470 | 16.8k | int texel = (z * y_texels + y) * x_texels + x; |
471 | | |
472 | 16.8k | int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; |
473 | 16.8k | int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; |
474 | 16.8k | int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6; |
475 | | |
476 | 16.8k | int x_weight_frac = x_weight & 0xF; |
477 | 16.8k | int y_weight_frac = y_weight & 0xF; |
478 | 16.8k | int z_weight_frac = z_weight & 0xF; |
479 | 16.8k | int x_weight_int = x_weight >> 4; |
480 | 16.8k | int y_weight_int = y_weight >> 4; |
481 | 16.8k | int z_weight_int = z_weight >> 4; |
482 | 16.8k | int qweight[4]; |
483 | 16.8k | int weight[4]; |
484 | 16.8k | qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int; |
485 | 16.8k | qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1); |
486 | | |
487 | | // Simplex interpolation: the ordering of the three fractions selects the two intermediate vertices |
488 | 16.8k | int fs = x_weight_frac; |
489 | 16.8k | int ft = y_weight_frac; |
490 | 16.8k | int fp = z_weight_frac; |
491 | | |
492 | 16.8k | int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp)); |
493 | 16.8k | int N = x_weights; |
494 | 16.8k | int NM = x_weights * y_weights; |
495 | | |
496 | 16.8k | int s1, s2, w0, w1, w2, w3; |
497 | 16.8k | switch (cas) |
498 | 16.8k | { |
499 | 1.15k | case 7: |
500 | 1.15k | s1 = 1; |
501 | 1.15k | s2 = N; |
502 | 1.15k | w0 = 16 - fs; |
503 | 1.15k | w1 = fs - ft; |
504 | 1.15k | w2 = ft - fp; |
505 | 1.15k | w3 = fp; |
506 | 1.15k | break; |
507 | 2.10k | case 3: |
508 | 2.10k | s1 = N; |
509 | 2.10k | s2 = 1; |
510 | 2.10k | w0 = 16 - ft; |
511 | 2.10k | w1 = ft - fs; |
512 | 2.10k | w2 = fs - fp; |
513 | 2.10k | w3 = fp; |
514 | 2.10k | break; |
515 | 3.10k | case 5: |
516 | 3.10k | s1 = 1; |
517 | 3.10k | s2 = NM; |
518 | 3.10k | w0 = 16 - fs; |
519 | 3.10k | w1 = fs - fp; |
520 | 3.10k | w2 = fp - ft; |
521 | 3.10k | w3 = ft; |
522 | 3.10k | break; |
523 | 2.10k | case 4: |
524 | 2.10k | s1 = NM; |
525 | 2.10k | s2 = 1; |
526 | 2.10k | w0 = 16 - fp; |
527 | 2.10k | w1 = fp - fs; |
528 | 2.10k | w2 = fs - ft; |
529 | 2.10k | w3 = ft; |
530 | 2.10k | break; |
531 | 3.10k | case 2: |
532 | 3.10k | s1 = N; |
533 | 3.10k | s2 = NM; |
534 | 3.10k | w0 = 16 - ft; |
535 | 3.10k | w1 = ft - fp; |
536 | 3.10k | w2 = fp - fs; |
537 | 3.10k | w3 = fs; |
538 | 3.10k | break; |
539 | 5.28k | case 0: |
540 | 5.28k | s1 = NM; |
541 | 5.28k | s2 = N; |
542 | 5.28k | w0 = 16 - fp; |
543 | 5.28k | w1 = fp - ft; |
544 | 5.28k | w2 = ft - fs; |
545 | 5.28k | w3 = fs; |
546 | 5.28k | break; |
547 | 0 | default: |
548 | 0 | s1 = NM; |
549 | 0 | s2 = N; |
550 | 0 | w0 = 16 - fp; |
551 | 0 | w1 = fp - ft; |
552 | 0 | w2 = ft - fs; |
553 | 0 | w3 = fs; |
554 | 0 | break; |
555 | 16.8k | } |
556 | | |
557 | 16.8k | qweight[1] = qweight[0] + s1; |
558 | 16.8k | qweight[2] = qweight[1] + s2; |
559 | 16.8k | weight[0] = w0; |
560 | 16.8k | weight[1] = w1; |
561 | 16.8k | weight[2] = w2; |
562 | 16.8k | weight[3] = w3; |
563 | | |
564 | 84.2k | for (unsigned int i = 0; i < 4; i++) |
565 | 67.3k | { |
566 | 67.3k | if (weight[i] != 0) |
567 | 42.8k | { |
568 | 42.8k | wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); |
569 | 42.8k | wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); |
570 | 42.8k | wb.weight_count_of_texel[texel]++; |
571 | 42.8k | wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); |
572 | 42.8k | wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); |
573 | 42.8k | wb.texel_count_of_weight[qweight[i]]++; |
574 | 42.8k | max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); |
575 | 42.8k | } |
576 | 67.3k | } |
577 | 16.8k | } |
578 | 2.80k | } |
579 | 468 | } |
580 | | |
581 | 78 | uint8_t max_texel_weight_count = 0; |
582 | 16.9k | for (unsigned int i = 0; i < texels_per_block; i++) |
583 | 16.8k | { |
584 | 16.8k | di.texel_weight_count[i] = wb.weight_count_of_texel[i]; |
585 | 16.8k | max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); |
586 | | |
587 | | // Init all 4 entries so we can rely on zeros for vectorization; used entries are overwritten by the next loop |
588 | 84.2k | for (unsigned int j = 0; j < 4; j++) |
589 | 67.3k | { |
590 | 67.3k | di.texel_weight_contribs_int_tr[j][i] = 0; |
591 | 67.3k | di.texel_weight_contribs_float_tr[j][i] = 0.0f; |
592 | 67.3k | di.texel_weights_tr[j][i] = 0; |
593 | 67.3k | } |
594 | | |
595 | 59.6k | for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) |
596 | 42.8k | { |
597 | 42.8k | di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j]; |
598 | 42.8k | di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); |
599 | 42.8k | di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j]; |
600 | 42.8k | } |
601 | 16.8k | } |
602 | | |
603 | 78 | di.max_texel_weight_count = max_texel_weight_count; |
604 | | |
605 | 3.03k | for (unsigned int i = 0; i < weights_per_block; i++) |
606 | 2.95k | { |
607 | 2.95k | unsigned int texel_count_wt = wb.texel_count_of_weight[i]; |
608 | 2.95k | di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); |
609 | | |
610 | 45.7k | for (unsigned int j = 0; j < texel_count_wt; j++) |
611 | 42.8k | { |
612 | 42.8k | unsigned int texel = wb.texels_of_weight[i][j]; |
613 | | |
614 | | // Create transposed versions of these for better vectorization |
615 | 42.8k | di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel); |
616 | 42.8k | di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); |
617 | | |
618 | | // Store the per-texel contribution of this weight: find the slot of this texel that references weight i |
619 | 42.8k | di.texel_contrib_for_weight[j][i] = 0.0f; |
620 | 80.2k | for (unsigned int k = 0; k < 4; k++) |
621 | 80.2k | { |
622 | 80.2k | uint8_t dttw = di.texel_weights_tr[k][texel]; |
623 | 80.2k | float dttwf = di.texel_weight_contribs_float_tr[k][texel]; |
624 | 80.2k | if (dttw == i && dttwf != 0.0f) |
625 | 42.8k | { |
626 | 42.8k | di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel]; |
627 | 42.8k | break; |
628 | 42.8k | } |
629 | 80.2k | } |
630 | 42.8k | } |
631 | | |
632 | | // Pad this weight's texel list up to the longest list so we can over-fetch with SIMD later to avoid loop tails |
633 | | // Match last texel in active lane in SIMD group, for better gathers |
634 | 2.95k | uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i]; |
635 | 25.4k | for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) |
636 | 22.5k | { |
637 | 22.5k | di.weight_texels_tr[j][i] = last_texel; |
638 | 22.5k | di.weights_texel_contribs_tr[j][i] = 0.0f; |
639 | 22.5k | } |
640 | 2.95k | } |
641 | | |
642 | | // Initialize the per-texel array tail so we can over-fetch with SIMD later to avoid loop tails |
643 | 78 | size_t texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); |
644 | 78 | for (size_t i = texels_per_block; i < texels_per_block_simd; i++) |
645 | 0 | { |
646 | 0 | di.texel_weight_count[i] = 0; |
647 | 

648 | 0 | for (size_t j = 0; j < 4; j++) |
649 | 0 | { |
650 | 0 | di.texel_weight_contribs_float_tr[j][i] = 0; |
651 | 0 | di.texel_weights_tr[j][i] = 0; |
652 | 0 | di.texel_weight_contribs_int_tr[j][i] = 0; |
653 | 0 | } |
654 | 0 | } |
655 | | |
656 | | // Initialize the per-weight array tail so we can over-fetch with SIMD later to avoid loop tails |
657 | | // Match last texel in active lane in SIMD group, for better gathers |
658 | 78 | int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; |
659 | 78 | uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1]; |
660 | | |
661 | 78 | size_t weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); |
662 | 118 | for (size_t i = weights_per_block; i < weights_per_block_simd; i++) |
663 | 40 | { |
664 | 40 | di.weight_texel_count[i] = 0; |
665 | | |
666 | 1.15k | for (size_t j = 0; j < max_texel_count_of_weight; j++) |
667 | 1.11k | { |
668 | 1.11k | di.weight_texels_tr[j][i] = last_texel; |
669 | 1.11k | di.weights_texel_contribs_tr[j][i] = 0.0f; |
670 | 1.11k | } |
671 | 40 | } |
672 | | |
673 | 78 | di.texel_count = static_cast<uint8_t>(texels_per_block); |
674 | 78 | di.weight_count = static_cast<uint8_t>(weights_per_block); |
675 | 78 | di.weight_x = static_cast<uint8_t>(x_weights); |
676 | 78 | di.weight_y = static_cast<uint8_t>(y_weights); |
677 | 78 | di.weight_z = static_cast<uint8_t>(z_weights); |
678 | 78 | } |
679 | | |
680 | | /** |
681 | | * @brief Assign the texels to use for kmeans clustering. |
682 | | * |
683 | | * Blocks with at most @c BLOCK_MAX_KMEANS_TEXELS texels use every texel, in index order; above this a random selection of @c BLOCK_MAX_KMEANS_TEXELS distinct texels is used. |
684 | | * The @c bsd.texel_count is an input and must be populated beforehand. |
685 | | * NOTE(review): the generator state is initialized without any caller-supplied seed, so the selection is presumably identical on every call - confirm against astc::rand_init. |
686 | | * @param[in,out] bsd The block size descriptor to populate. |
687 | | */ |
688 | | static void assign_kmeans_texels( |
689 | | block_size_descriptor& bsd |
690 | 3 | ) { |
691 | | // Use all texels for kmeans on a small block |
692 | 3 | if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS) |
693 | 1 | { |
694 | 17 | for (uint8_t i = 0; i < bsd.texel_count; i++) |
695 | 16 | { |
696 | 16 | bsd.kmeans_texels[i] = i; |
697 | 16 | } |
698 | | |
699 | 1 | return; |
700 | 1 | } |
701 | | |
702 | | // Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block |
703 | 2 | uint64_t rng_state[2]; |
704 | 2 | astc::rand_init(rng_state); |
705 | | |
706 | | // Initialize array used for tracking used indices |
707 | 2 | bool seen[BLOCK_MAX_TEXELS]; |
708 | 362 | for (uint8_t i = 0; i < bsd.texel_count; i++) |
709 | 360 | { |
710 | 360 | seen[i] = false; |
711 | 360 | } |
712 | | |
713 | | // Assign BLOCK_MAX_KMEANS_TEXELS random indices, retrying if we see repeats |
714 | 2 | unsigned int arr_elements_set = 0; |
715 | 155 | while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS) |
716 | 153 | { |
717 | 153 | uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state)); |
718 | 153 | texel = texel % bsd.texel_count; |
719 | 153 | if (!seen[texel]) |
720 | 128 | { |
721 | 128 | bsd.kmeans_texels[arr_elements_set++] = texel; |
722 | 128 | seen[texel] = true; |
723 | 128 | } |
724 | 153 | } |
725 | 2 | } |
726 | | |
727 | | /** |
728 | | * @brief Populate a single 2D decimation table entry and its matching decimation mode record. |
729 | | * @details Builds @c bsd.decimation_tables[index] for the given grid, then records in @c bsd.decimation_modes[index] the highest quantization level in [0, 11] whose ISE weight bit count lies in [@c BLOCK_MIN_WEIGHT_BITS, @c BLOCK_MAX_WEIGHT_BITS], once for a single plane and, only if twice the weight count still fits in @c BLOCK_MAX_WEIGHTS, once for two planes. A stored value of -1 means no level fits. Both reference precisions are reset to 0. |
730 | | * @param x_texels The number of texels in the X dimension. |
731 | | * @param y_texels The number of texels in the Y dimension. |
732 | | * @param x_weights The number of weights in the X dimension. |
733 | | * @param y_weights The number of weights in the Y dimension. |
734 | | * @param bsd The block size descriptor we are populating. |
735 | | * @param wb The decimation table init scratch working buffers. |
736 | | * @param index The packed array index to populate. |
737 | | */ |
738 | | static void construct_dt_entry_2d( |
739 | | unsigned int x_texels, |
740 | | unsigned int y_texels, |
741 | | unsigned int x_weights, |
742 | | unsigned int y_weights, |
743 | | block_size_descriptor& bsd, |
744 | | dt_init_working_buffers& wb, |
745 | | unsigned int index |
746 | 96 | ) { |
747 | 96 | unsigned int weight_count = x_weights * y_weights; |
748 | 96 | assert(weight_count <= BLOCK_MAX_WEIGHTS); |
749 | | |
750 | 96 | bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS; |
751 | | |
752 | 96 | decimation_info& di = bsd.decimation_tables[index]; |
753 | 96 | init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb); |
754 | | |
755 | 96 | int maxprec_1plane = -1; |
756 | 96 | int maxprec_2planes = -1; |
757 | 1.24k | for (int i = 0; i < 12; i++) |
758 | 1.15k | { |
759 | 1.15k | unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); |
760 | 1.15k | if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) |
761 | 590 | { |
762 | 590 | maxprec_1plane = i; |
763 | 590 | } |
764 | | |
765 | 1.15k | if (try_2planes) |
766 | 684 | { |
767 | 684 | unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); |
768 | 684 | if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) |
769 | 330 | { |
770 | 330 | maxprec_2planes = i; |
771 | 330 | } |
772 | 684 | } |
773 | 1.15k | } |
774 | | |
775 | | // At least one of the two should be valid, otherwise this grid cannot be used by any block mode |
776 | 96 | assert(maxprec_1plane >= 0 || maxprec_2planes >= 0); |
777 | 96 | bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); |
778 | 96 | bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); |
779 | 96 | bsd.decimation_modes[index].refprec_1plane = 0; |
780 | 96 | bsd.decimation_modes[index].refprec_2planes = 0; |
781 | 96 | } |
782 | | |
783 | | /** |
784 | | * @brief Allocate block modes and decimation tables for a single 2D block size. |
785 | | * |
786 | | * @param x_texels The number of texels in the X dimension. |
787 | | * @param y_texels The number of texels in the Y dimension. |
788 | | * @param can_omit_modes Can we discard modes that astcenc won't use, even if legal? |
789 | | * @param mode_cutoff Percentile cutoff in range [0,1]. Low values more likely to be used. |
790 | | * @param[out] bsd The block size descriptor to populate. |
791 | | */ |
792 | | static void construct_block_size_descriptor_2d( |
793 | | unsigned int x_texels, |
794 | | unsigned int y_texels, |
795 | | bool can_omit_modes, |
796 | | float mode_cutoff, |
797 | | block_size_descriptor& bsd |
798 | 2 | ) { |
799 | | // Store a remap table for storing packed decimation modes. |
800 | | // Indexing uses [Y * 16 + X] and max size for each axis is 12. |
801 | 2 | static const unsigned int MAX_DMI = 12 * 16 + 12; |
802 | 2 | int decimation_mode_index[MAX_DMI]; |
803 | | |
804 | 2 | dt_init_working_buffers* wb = new dt_init_working_buffers; |
805 | | |
806 | 2 | bsd.xdim = static_cast<uint8_t>(x_texels); |
807 | 2 | bsd.ydim = static_cast<uint8_t>(y_texels); |
808 | 2 | bsd.zdim = 1; |
809 | 2 | bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels); |
810 | | |
811 | 410 | for (unsigned int i = 0; i < MAX_DMI; i++) |
812 | 408 | { |
813 | 408 | decimation_mode_index[i] = -1; |
814 | 408 | } |
815 | | |
816 | | // Gather all the decimation grids that can be used with the current block |
817 | 2 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
818 | 2 | const float *percentiles = get_2d_percentile_table(x_texels, y_texels); |
819 | 2 | float always_cutoff = 0.0f; |
820 | | #else |
821 | | // Unused in decompress-only builds |
822 | | (void)can_omit_modes; |
823 | | (void)mode_cutoff; |
824 | | #endif |
825 | | |
826 | | // Construct the list of block formats referencing the decimation tables |
827 | 2 | unsigned int packed_bm_idx = 0; |
828 | 2 | unsigned int packed_dm_idx = 0; |
829 | | |
830 | | // Trackers |
831 | 2 | unsigned int bm_counts[4] { 0 }; |
832 | 2 | unsigned int dm_counts[4] { 0 }; |
833 | | |
834 | | // Clear the list to a known-bad value |
835 | 4.09k | for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
836 | 4.09k | { |
837 | 4.09k | bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; |
838 | 4.09k | } |
839 | | |
840 | | // Iterate four times to build a usefully ordered list: |
841 | | // - Pass 0 - keep selected single plane "always" block modes |
842 | | // - Pass 1 - keep selected single plane "non-always" block modes |
843 | | // - Pass 2 - keep select dual plane block modes |
844 | | // - Pass 3 - keep everything else that's legal |
845 | 2 | unsigned int limit = can_omit_modes ? 3 : 4; |
846 | 10 | for (unsigned int j = 0; j < limit; j ++) |
847 | 8 | { |
848 | 16.3k | for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
849 | 16.3k | { |
850 | | // Skip modes we've already included in a previous pass |
851 | 16.3k | if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) |
852 | 1.51k | { |
853 | 1.51k | continue; |
854 | 1.51k | } |
855 | | |
856 | | // Decode parameters |
857 | 14.8k | unsigned int x_weights; |
858 | 14.8k | unsigned int y_weights; |
859 | 14.8k | bool is_dual_plane; |
860 | 14.8k | unsigned int quant_mode; |
861 | 14.8k | unsigned int weight_bits; |
862 | 14.8k | bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits); |
863 | | |
864 | | // Always skip invalid encodings for the current block size |
865 | 14.8k | if (!valid || (x_weights > x_texels) || (y_weights > y_texels)) |
866 | 12.7k | { |
867 | 12.7k | continue; |
868 | 12.7k | } |
869 | | |
870 | | // Selectively skip dual plane encodings |
871 | 2.16k | if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane)) |
872 | 660 | { |
873 | 660 | continue; |
874 | 660 | } |
875 | | |
876 | | // Always skip encodings we can't physically encode based on |
877 | | // generic encoding bit availability |
878 | 1.50k | if (is_dual_plane) |
879 | 330 | { |
880 | | // This is the only check we need as only support 1 partition |
881 | 330 | if ((109 - weight_bits) <= 0) |
882 | 0 | { |
883 | 0 | continue; |
884 | 0 | } |
885 | 330 | } |
886 | 1.17k | else |
887 | 1.17k | { |
888 | | // This is conservative - fewer bits may be available for > 1 partition |
889 | 1.17k | if ((111 - weight_bits) <= 0) |
890 | 0 | { |
891 | 0 | continue; |
892 | 0 | } |
893 | 1.17k | } |
894 | | |
895 | | // Selectively skip encodings based on percentile |
896 | 1.50k | bool percentile_hit = false; |
897 | 1.50k | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
898 | 1.50k | if (j == 0) |
899 | 590 | { |
900 | 590 | percentile_hit = percentiles[i] <= always_cutoff; |
901 | 590 | } |
902 | 918 | else |
903 | 918 | { |
904 | 918 | percentile_hit = percentiles[i] <= mode_cutoff; |
905 | 918 | } |
906 | 1.50k | #endif |
907 | | |
908 | 1.50k | if (j != 3 && !percentile_hit) |
909 | 588 | { |
910 | 588 | continue; |
911 | 588 | } |
912 | | |
913 | | // Allocate and initialize the decimation table entry if we've not used it yet |
914 | 920 | int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights]; |
915 | 920 | if (decimation_mode < 0) |
916 | 96 | { |
917 | 96 | construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx); |
918 | 96 | decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx; |
919 | 96 | decimation_mode = packed_dm_idx; |
920 | | |
921 | 96 | dm_counts[j]++; |
922 | 96 | packed_dm_idx++; |
923 | 96 | } |
924 | | |
925 | 920 | auto& bm = bsd.block_modes[packed_bm_idx]; |
926 | | |
927 | 920 | bm.decimation_mode = static_cast<uint8_t>(decimation_mode); |
928 | 920 | bm.quant_mode = static_cast<uint8_t>(quant_mode); |
929 | 920 | bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane); |
930 | 920 | bm.weight_bits = static_cast<uint8_t>(weight_bits); |
931 | 920 | bm.mode_index = static_cast<uint16_t>(i); |
932 | | |
933 | 920 | auto& dm = bsd.decimation_modes[decimation_mode]; |
934 | | |
935 | 920 | if (is_dual_plane) |
936 | 330 | { |
937 | 330 | dm.set_ref_2plane(bm.get_weight_quant_mode()); |
938 | 330 | } |
939 | 590 | else |
940 | 590 | { |
941 | 590 | dm.set_ref_1plane(bm.get_weight_quant_mode()); |
942 | 590 | } |
943 | | |
944 | 920 | bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx); |
945 | | |
946 | 920 | packed_bm_idx++; |
947 | 920 | bm_counts[j]++; |
948 | 920 | } |
949 | 8 | } |
950 | | |
951 | 2 | bsd.block_mode_count_1plane_always = bm_counts[0]; |
952 | 2 | bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1]; |
953 | 2 | bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2]; |
954 | 2 | bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3]; |
955 | | |
956 | 2 | bsd.decimation_mode_count_always = dm_counts[0]; |
957 | 2 | bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2]; |
958 | 2 | bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3]; |
959 | | |
960 | 2 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
961 | 2 | assert(bsd.block_mode_count_1plane_always > 0); |
962 | 2 | assert(bsd.decimation_mode_count_always > 0); |
963 | | |
964 | 2 | delete[] percentiles; |
965 | 2 | #endif |
966 | | |
967 | | // Ensure the end of the array contains valid data (should never get read) |
968 | 80 | for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++) |
969 | 78 | { |
970 | 78 | bsd.decimation_modes[i].maxprec_1plane = -1; |
971 | 78 | bsd.decimation_modes[i].maxprec_2planes = -1; |
972 | 78 | bsd.decimation_modes[i].refprec_1plane = 0; |
973 | 78 | bsd.decimation_modes[i].refprec_2planes = 0; |
974 | 78 | } |
975 | | |
976 | | // Determine the texels to use for kmeans clustering. |
977 | 2 | assign_kmeans_texels(bsd); |
978 | | |
979 | 2 | delete wb; |
980 | 2 | } |
981 | | |
982 | | /** |
983 | | * @brief Allocate block modes and decimation tables for a single 3D block size. |
984 | | * |
985 | | * TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as |
986 | | * the percentile mode cutoffs. If 3D becomes more widely used we should look at this. |
987 | | * |
988 | | * @param x_texels The number of texels in the X dimension. |
989 | | * @param y_texels The number of texels in the Y dimension. |
990 | | * @param z_texels The number of texels in the Z dimension. |
991 | | * @param[out] bsd The block size descriptor to populate. |
992 | | */ |
993 | | static void construct_block_size_descriptor_3d( |
994 | | unsigned int x_texels, |
995 | | unsigned int y_texels, |
996 | | unsigned int z_texels, |
997 | | block_size_descriptor& bsd |
998 | 1 | ) { |
999 | | // Store a remap table for storing packed decimation modes. |
1000 | | // Indexing uses [Z * 64 + Y * 8 + X] and max size for each axis is 6. |
1001 | 1 | static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6; |
1002 | 1 | int decimation_mode_index[MAX_DMI]; |
1003 | 1 | unsigned int decimation_mode_count = 0; |
1004 | | |
1005 | 1 | dt_init_working_buffers* wb = new dt_init_working_buffers; |
1006 | | |
1007 | 1 | bsd.xdim = static_cast<uint8_t>(x_texels); |
1008 | 1 | bsd.ydim = static_cast<uint8_t>(y_texels); |
1009 | 1 | bsd.zdim = static_cast<uint8_t>(z_texels); |
1010 | 1 | bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels); |
1011 | | |
1012 | 439 | for (unsigned int i = 0; i < MAX_DMI; i++) |
1013 | 438 | { |
1014 | 438 | decimation_mode_index[i] = -1; |
1015 | 438 | } |
1016 | | |
1017 | | // gather all the infill-modes that can be used with the current block size |
1018 | 6 | for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++) |
1019 | 5 | { |
1020 | 30 | for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++) |
1021 | 25 | { |
1022 | 150 | for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++) |
1023 | 125 | { |
1024 | 125 | unsigned int weight_count = x_weights * y_weights * z_weights; |
1025 | 125 | if (weight_count > BLOCK_MAX_WEIGHTS) |
1026 | 47 | { |
1027 | 47 | continue; |
1028 | 47 | } |
1029 | | |
1030 | 78 | decimation_info& di = bsd.decimation_tables[decimation_mode_count]; |
1031 | 78 | decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count; |
1032 | 78 | init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb); |
1033 | | |
1034 | 78 | int maxprec_1plane = -1; |
1035 | 78 | int maxprec_2planes = -1; |
1036 | 1.01k | for (unsigned int i = 0; i < 12; i++) |
1037 | 936 | { |
1038 | 936 | unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); |
1039 | 936 | if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) |
1040 | 417 | { |
1041 | 417 | maxprec_1plane = i; |
1042 | 417 | } |
1043 | | |
1044 | 936 | unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); |
1045 | 936 | if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) |
1046 | 154 | { |
1047 | 154 | maxprec_2planes = i; |
1048 | 154 | } |
1049 | 936 | } |
1050 | | |
1051 | 78 | if ((2 * weight_count) > BLOCK_MAX_WEIGHTS) |
1052 | 46 | { |
1053 | 46 | maxprec_2planes = -1; |
1054 | 46 | } |
1055 | | |
1056 | 78 | bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); |
1057 | 78 | bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); |
1058 | 78 | bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -1 ? 0 : 0xFFFF; |
1059 | 78 | bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -1 ? 0 : 0xFFFF; |
1060 | 78 | decimation_mode_count++; |
1061 | 78 | } |
1062 | 25 | } |
1063 | 5 | } |
1064 | | |
1065 | | // Ensure the end of the array contains valid data (should never get read) |
1066 | 10 | for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++) |
1067 | 9 | { |
1068 | 9 | bsd.decimation_modes[i].maxprec_1plane = -1; |
1069 | 9 | bsd.decimation_modes[i].maxprec_2planes = -1; |
1070 | 9 | bsd.decimation_modes[i].refprec_1plane = 0; |
1071 | 9 | bsd.decimation_modes[i].refprec_2planes = 0; |
1072 | 9 | } |
1073 | | |
1074 | 1 | bsd.decimation_mode_count_always = 0; // Skipped for 3D modes |
1075 | 1 | bsd.decimation_mode_count_selected = decimation_mode_count; |
1076 | 1 | bsd.decimation_mode_count_all = decimation_mode_count; |
1077 | | |
1078 | | // Construct the list of block formats referencing the decimation tables |
1079 | | |
1080 | | // Clear the list to a known-bad value |
1081 | 2.04k | for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
1082 | 2.04k | { |
1083 | 2.04k | bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; |
1084 | 2.04k | } |
1085 | | |
1086 | 1 | unsigned int packed_idx = 0; |
1087 | 1 | unsigned int bm_counts[2] { 0 }; |
1088 | | |
1089 | | // Iterate two times to build a usefully ordered list: |
1090 | | // - Pass 0 - keep valid single plane block modes |
1091 | | // - Pass 1 - keep valid dual plane block modes |
1092 | 3 | for (unsigned int j = 0; j < 2; j++) |
1093 | 2 | { |
1094 | 4.09k | for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) |
1095 | 4.09k | { |
1096 | | // Skip modes we've already included in a previous pass |
1097 | 4.09k | if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) |
1098 | 435 | { |
1099 | 435 | continue; |
1100 | 435 | } |
1101 | | |
1102 | 3.66k | unsigned int x_weights; |
1103 | 3.66k | unsigned int y_weights; |
1104 | 3.66k | unsigned int z_weights; |
1105 | 3.66k | bool is_dual_plane; |
1106 | 3.66k | unsigned int quant_mode; |
1107 | 3.66k | unsigned int weight_bits; |
1108 | | |
1109 | 3.66k | bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits); |
1110 | | // Skip invalid encodings |
1111 | 3.66k | if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels) |
1112 | 2.97k | { |
1113 | 2.97k | continue; |
1114 | 2.97k | } |
1115 | | |
1116 | | // Skip encodings in the wrong iteration |
1117 | 689 | if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane)) |
1118 | 127 | { |
1119 | 127 | continue; |
1120 | 127 | } |
1121 | | |
1122 | | // Always skip encodings we can't physically encode based on bit availability |
1123 | 562 | if (is_dual_plane) |
1124 | 127 | { |
1125 | | // This is the only check we need as only support 1 partition |
1126 | 127 | if ((109 - weight_bits) <= 0) |
1127 | 0 | { |
1128 | 0 | continue; |
1129 | 0 | } |
1130 | 127 | } |
1131 | 435 | else |
1132 | 435 | { |
1133 | | // This is conservative - fewer bits may be available for > 1 partition |
1134 | 435 | if ((111 - weight_bits) <= 0) |
1135 | 0 | { |
1136 | 0 | continue; |
1137 | 0 | } |
1138 | 435 | } |
1139 | | |
1140 | 562 | int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights]; |
1141 | 562 | bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode); |
1142 | 562 | bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode); |
1143 | 562 | bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits); |
1144 | 562 | bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane); |
1145 | 562 | bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i); |
1146 | | |
1147 | 562 | bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx); |
1148 | 562 | bm_counts[j]++; |
1149 | 562 | packed_idx++; |
1150 | 562 | } |
1151 | 2 | } |
1152 | | |
1153 | 1 | bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes |
1154 | 1 | bsd.block_mode_count_1plane_selected = bm_counts[0]; |
1155 | 1 | bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1]; |
1156 | 1 | bsd.block_mode_count_all = bm_counts[0] + bm_counts[1]; |
1157 | | |
1158 | | // Determine the texels to use for kmeans clustering. |
1159 | 1 | assign_kmeans_texels(bsd); |
1160 | | |
1161 | 1 | delete wb; |
1162 | 1 | } |
1163 | | |
1164 | | /* See header for documentation. */ |
1165 | | void init_block_size_descriptor( |
1166 | | unsigned int x_texels, |
1167 | | unsigned int y_texels, |
1168 | | unsigned int z_texels, |
1169 | | bool can_omit_modes, |
1170 | | unsigned int partition_count_cutoff, |
1171 | | float mode_cutoff, |
1172 | | block_size_descriptor& bsd |
1173 | 3 | ) { |
1174 | 3 | if (z_texels > 1) |
1175 | 1 | { |
1176 | 1 | construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd); |
1177 | 1 | } |
1178 | 2 | else |
1179 | 2 | { |
1180 | 2 | construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd); |
1181 | 2 | } |
1182 | | |
1183 | 3 | init_partition_tables(bsd, can_omit_modes, partition_count_cutoff); |
1184 | 3 | } |