/src/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2024 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
19 | | |
20 | | /** |
21 | | * @brief Functions for computing color endpoints and texel weights. |
22 | | */ |
23 | | |
24 | | #include <cassert> |
25 | | |
26 | | #include "astcenc_internal.h" |
27 | | #include "astcenc_vecmathlib.h" |
28 | | |
29 | | /** |
30 | | * @brief Compute the infilled weight for N texel indices in a decimated grid. |
31 | | * |
32 | | * @param di The weight grid decimation to use. |
33 | | * @param weights The decimated weight values to use. |
34 | | * @param index The first texel index to interpolate. |
35 | | * |
36 | | * @return The interpolated weight for the given set of SIMD_WIDTH texels. |
37 | | */ |
38 | | static vfloat bilinear_infill_vla( |
39 | | const decimation_info& di, |
40 | | const float* weights, |
41 | | unsigned int index |
42 | 2.88M | ) { |
43 | | // Load the bilinear filter texel weight indexes in the decimated grid |
44 | 2.88M | const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; |
45 | 2.88M | const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; |
46 | 2.88M | const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index; |
47 | 2.88M | const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index; |
48 | | |
49 | | // Load the bilinear filter weights from the decimated grid |
50 | 2.88M | vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0); |
51 | 2.88M | vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1); |
52 | 2.88M | vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2); |
53 | 2.88M | vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3); |
54 | | |
55 | | // Load the weight contribution factors for each decimated weight |
56 | 2.88M | vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); |
57 | 2.88M | vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); |
58 | 2.88M | vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index); |
59 | 2.88M | vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index); |
60 | | |
61 | | // Compute the bilinear interpolation to generate the per-texel weight |
62 | 2.88M | return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) + |
63 | 2.88M | (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3); |
64 | 2.88M | } |
65 | | |
66 | | /** |
67 | | * @brief Compute the infilled weight for N texel indices in a decimated grid. |
68 | | * |
69 | | * This is specialized version which computes only two weights per texel for |
70 | | * encodings that are only decimated in a single axis. |
71 | | * |
72 | | * @param di The weight grid decimation to use. |
73 | | * @param weights The decimated weight values to use. |
74 | | * @param index The first texel index to interpolate. |
75 | | * |
76 | | * @return The interpolated weight for the given set of SIMD_WIDTH texels. |
77 | | */ |
78 | | static vfloat bilinear_infill_vla_2( |
79 | | const decimation_info& di, |
80 | | const float* weights, |
81 | | unsigned int index |
82 | 1.47M | ) { |
83 | | // Load the bilinear filter texel weight indexes in the decimated grid |
84 | 1.47M | const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index; |
85 | 1.47M | const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index; |
86 | | |
87 | | // Load the bilinear filter weights from the decimated grid |
88 | 1.47M | vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0); |
89 | 1.47M | vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1); |
90 | | |
91 | | // Load the weight contribution factors for each decimated weight |
92 | 1.47M | vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index); |
93 | 1.47M | vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index); |
94 | | |
95 | | // Compute the bilinear interpolation to generate the per-texel weight |
96 | 1.47M | return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1); |
97 | 1.47M | } |
98 | | |
99 | | /** |
100 | | * @brief Compute the ideal endpoints and weights for 1 color component. |
101 | | * |
102 | | * @param blk The image block color data to compress. |
103 | | * @param pi The partition info for the current trial. |
104 | | * @param[out] ei The computed ideal endpoints and weights. |
105 | | * @param component The color component to compute. |
106 | | */ |
107 | | static void compute_ideal_colors_and_weights_1_comp( |
108 | | const image_block& blk, |
109 | | const partition_info& pi, |
110 | | endpoints_and_weights& ei, |
111 | | unsigned int component |
112 | 6.99k | ) { |
113 | 6.99k | unsigned int partition_count = pi.partition_count; |
114 | 6.99k | ei.ep.partition_count = partition_count; |
115 | 6.99k | promise(partition_count > 0); |
116 | | |
117 | 6.99k | unsigned int texel_count = blk.texel_count; |
118 | 6.99k | promise(texel_count > 0); |
119 | | |
120 | 6.99k | float error_weight; |
121 | 6.99k | const float* data_vr = nullptr; |
122 | | |
123 | 6.99k | assert(component < BLOCK_MAX_COMPONENTS); |
124 | 6.99k | switch (component) |
125 | 6.99k | { |
126 | 1.66k | case 0: |
127 | 1.66k | error_weight = blk.channel_weight.lane<0>(); |
128 | 1.66k | data_vr = blk.data_r; |
129 | 1.66k | break; |
130 | 1.69k | case 1: |
131 | 1.69k | error_weight = blk.channel_weight.lane<1>(); |
132 | 1.69k | data_vr = blk.data_g; |
133 | 1.69k | break; |
134 | 1.83k | case 2: |
135 | 1.83k | error_weight = blk.channel_weight.lane<2>(); |
136 | 1.83k | data_vr = blk.data_b; |
137 | 1.83k | break; |
138 | 1.79k | default: |
139 | 1.79k | assert(component == 3); |
140 | 1.79k | error_weight = blk.channel_weight.lane<3>(); |
141 | 1.79k | data_vr = blk.data_a; |
142 | 1.79k | break; |
143 | 6.99k | } |
144 | | |
145 | 6.99k | vmask4 sep_mask = vint4::lane_id() == vint4(component); |
146 | 6.99k | bool is_constant_wes { true }; |
147 | 6.99k | float partition0_len_sq { 0.0f }; |
148 | | |
149 | 13.9k | for (unsigned int i = 0; i < partition_count; i++) |
150 | 6.99k | { |
151 | 6.99k | float lowvalue { 1e10f }; |
152 | 6.99k | float highvalue { -1e10f }; |
153 | | |
154 | 6.99k | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
155 | 181k | for (unsigned int j = 0; j < partition_texel_count; j++) |
156 | 174k | { |
157 | 174k | unsigned int tix = pi.texels_of_partition[i][j]; |
158 | 174k | float value = data_vr[tix]; |
159 | 174k | lowvalue = astc::min(value, lowvalue); |
160 | 174k | highvalue = astc::max(value, highvalue); |
161 | 174k | } |
162 | | |
163 | 6.99k | if (highvalue <= lowvalue) |
164 | 0 | { |
165 | 0 | lowvalue = 0.0f; |
166 | 0 | highvalue = 1e-7f; |
167 | 0 | } |
168 | | |
169 | 6.99k | float length = highvalue - lowvalue; |
170 | 6.99k | float length_squared = length * length; |
171 | 6.99k | float scale = 1.0f / length; |
172 | | |
173 | 6.99k | if (i == 0) |
174 | 6.99k | { |
175 | 6.99k | partition0_len_sq = length_squared; |
176 | 6.99k | } |
177 | 0 | else |
178 | 0 | { |
179 | 0 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
180 | 0 | } |
181 | | |
182 | 181k | for (unsigned int j = 0; j < partition_texel_count; j++) |
183 | 174k | { |
184 | 174k | unsigned int tix = pi.texels_of_partition[i][j]; |
185 | 174k | float value = (data_vr[tix] - lowvalue) * scale; |
186 | 174k | value = astc::clamp1f(value); |
187 | | |
188 | 174k | ei.weights[tix] = value; |
189 | 174k | ei.weight_error_scale[tix] = length_squared * error_weight; |
190 | 174k | assert(!astc::isnan(ei.weight_error_scale[tix])); |
191 | 174k | } |
192 | | |
193 | 6.99k | ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask); |
194 | 6.99k | ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask); |
195 | 6.99k | } |
196 | | |
197 | | // Zero initialize any SIMD over-fetch |
198 | 6.99k | size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
199 | 9.10k | for (size_t i = texel_count; i < texel_count_simd; i++) |
200 | 2.10k | { |
201 | 2.10k | ei.weights[i] = 0.0f; |
202 | 2.10k | ei.weight_error_scale[i] = 0.0f; |
203 | 2.10k | } |
204 | | |
205 | 6.99k | ei.is_constant_weight_error_scale = is_constant_wes; |
206 | 6.99k | } |
207 | | |
208 | | /** |
209 | | * @brief Compute the ideal endpoints and weights for 2 color components. |
210 | | * |
211 | | * @param blk The image block color data to compress. |
212 | | * @param pi The partition info for the current trial. |
213 | | * @param[out] ei The computed ideal endpoints and weights. |
214 | | * @param component1 The first color component to compute. |
215 | | * @param component2 The second color component to compute. |
216 | | */ |
217 | | static void compute_ideal_colors_and_weights_2_comp( |
218 | | const image_block& blk, |
219 | | const partition_info& pi, |
220 | | endpoints_and_weights& ei, |
221 | | int component1, |
222 | | int component2 |
223 | 802 | ) { |
224 | 802 | unsigned int partition_count = pi.partition_count; |
225 | 802 | ei.ep.partition_count = partition_count; |
226 | 802 | promise(partition_count > 0); |
227 | | |
228 | 802 | unsigned int texel_count = blk.texel_count; |
229 | 802 | promise(texel_count > 0); |
230 | | |
231 | 802 | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
232 | | |
233 | 802 | float error_weight; |
234 | 802 | const float* data_vr = nullptr; |
235 | 802 | const float* data_vg = nullptr; |
236 | | |
237 | 802 | if (component1 == 0 && component2 == 1) |
238 | 303 | { |
239 | 303 | error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; |
240 | | |
241 | 303 | data_vr = blk.data_r; |
242 | 303 | data_vg = blk.data_g; |
243 | 303 | } |
244 | 499 | else if (component1 == 0 && component2 == 2) |
245 | 250 | { |
246 | 250 | error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; |
247 | | |
248 | 250 | data_vr = blk.data_r; |
249 | 250 | data_vg = blk.data_b; |
250 | 250 | } |
251 | 249 | else // (component1 == 1 && component2 == 2) |
252 | 249 | { |
253 | 249 | assert(component1 == 1 && component2 == 2); |
254 | | |
255 | 249 | error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; |
256 | | |
257 | 249 | data_vr = blk.data_g; |
258 | 249 | data_vg = blk.data_b; |
259 | 249 | } |
260 | | |
261 | 802 | compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); |
262 | | |
263 | 802 | bool is_constant_wes { true }; |
264 | 802 | float partition0_len_sq { 0.0f }; |
265 | | |
266 | 802 | vmask4 comp1_mask = vint4::lane_id() == vint4(component1); |
267 | 802 | vmask4 comp2_mask = vint4::lane_id() == vint4(component2); |
268 | | |
269 | 1.60k | for (unsigned int i = 0; i < partition_count; i++) |
270 | 802 | { |
271 | 802 | vfloat4 dir = pms[i].dir; |
272 | 802 | if (hadd_s(dir) < 0.0f) |
273 | 8 | { |
274 | 8 | dir = vfloat4::zero() - dir; |
275 | 8 | } |
276 | | |
277 | 802 | line2 line { pms[i].avg, normalize_safe(dir, unit2()) }; |
278 | 802 | float lowparam { 1e10f }; |
279 | 802 | float highparam { -1e10f }; |
280 | | |
281 | 802 | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
282 | 16.6k | for (unsigned int j = 0; j < partition_texel_count; j++) |
283 | 15.8k | { |
284 | 15.8k | unsigned int tix = pi.texels_of_partition[i][j]; |
285 | 15.8k | vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]); |
286 | 15.8k | float param = dot_s(point - line.a, line.b); |
287 | 15.8k | ei.weights[tix] = param; |
288 | | |
289 | 15.8k | lowparam = astc::min(param, lowparam); |
290 | 15.8k | highparam = astc::max(param, highparam); |
291 | 15.8k | } |
292 | | |
293 | | // It is possible for a uniform-color partition to produce length=0; |
294 | | // this causes NaN issues so set to small value to avoid this problem |
295 | 802 | if (highparam <= lowparam) |
296 | 9 | { |
297 | 9 | lowparam = 0.0f; |
298 | 9 | highparam = 1e-7f; |
299 | 9 | } |
300 | | |
301 | 802 | float length = highparam - lowparam; |
302 | 802 | float length_squared = length * length; |
303 | 802 | float scale = 1.0f / length; |
304 | | |
305 | 802 | if (i == 0) |
306 | 802 | { |
307 | 802 | partition0_len_sq = length_squared; |
308 | 802 | } |
309 | 0 | else |
310 | 0 | { |
311 | 0 | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
312 | 0 | } |
313 | | |
314 | 16.6k | for (unsigned int j = 0; j < partition_texel_count; j++) |
315 | 15.8k | { |
316 | 15.8k | unsigned int tix = pi.texels_of_partition[i][j]; |
317 | 15.8k | float idx = (ei.weights[tix] - lowparam) * scale; |
318 | 15.8k | idx = astc::clamp1f(idx); |
319 | | |
320 | 15.8k | ei.weights[tix] = idx; |
321 | 15.8k | ei.weight_error_scale[tix] = length_squared * error_weight; |
322 | 15.8k | assert(!astc::isnan(ei.weight_error_scale[tix])); |
323 | 15.8k | } |
324 | | |
325 | 802 | vfloat4 lowvalue = line.a + line.b * lowparam; |
326 | 802 | vfloat4 highvalue = line.a + line.b * highparam; |
327 | | |
328 | 802 | vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask); |
329 | 802 | vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask); |
330 | | |
331 | 802 | ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask); |
332 | 802 | ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask); |
333 | 802 | } |
334 | | |
335 | | // Zero initialize any SIMD over-fetch |
336 | 802 | size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
337 | 917 | for (size_t i = texel_count; i < texel_count_simd; i++) |
338 | 115 | { |
339 | 115 | ei.weights[i] = 0.0f; |
340 | 115 | ei.weight_error_scale[i] = 0.0f; |
341 | 115 | } |
342 | | |
343 | 802 | ei.is_constant_weight_error_scale = is_constant_wes; |
344 | 802 | } |
345 | | |
346 | | /** |
347 | | * @brief Compute the ideal endpoints and weights for 3 color components. |
348 | | * |
349 | | * @param blk The image block color data to compress. |
350 | | * @param pi The partition info for the current trial. |
351 | | * @param[out] ei The computed ideal endpoints and weights. |
352 | | * @param omitted_component The color component excluded from the calculation. |
353 | | */ |
354 | | static void compute_ideal_colors_and_weights_3_comp( |
355 | | const image_block& blk, |
356 | | const partition_info& pi, |
357 | | endpoints_and_weights& ei, |
358 | | unsigned int omitted_component |
359 | 7.79k | ) { |
360 | 7.79k | unsigned int partition_count = pi.partition_count; |
361 | 7.79k | ei.ep.partition_count = partition_count; |
362 | 7.79k | promise(partition_count > 0); |
363 | | |
364 | 7.79k | unsigned int texel_count = blk.texel_count; |
365 | 7.79k | promise(texel_count > 0); |
366 | | |
367 | 7.79k | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
368 | | |
369 | 7.79k | float error_weight; |
370 | 7.79k | const float* data_vr = nullptr; |
371 | 7.79k | const float* data_vg = nullptr; |
372 | 7.79k | const float* data_vb = nullptr; |
373 | 7.79k | if (omitted_component == 0) |
374 | 1.41k | { |
375 | 1.41k | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); |
376 | 1.41k | data_vr = blk.data_g; |
377 | 1.41k | data_vg = blk.data_b; |
378 | 1.41k | data_vb = blk.data_a; |
379 | 1.41k | } |
380 | 6.37k | else if (omitted_component == 1) |
381 | 1.44k | { |
382 | 1.44k | error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); |
383 | 1.44k | data_vr = blk.data_r; |
384 | 1.44k | data_vg = blk.data_b; |
385 | 1.44k | data_vb = blk.data_a; |
386 | 1.44k | } |
387 | 4.92k | else if (omitted_component == 2) |
388 | 1.53k | { |
389 | 1.53k | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); |
390 | 1.53k | data_vr = blk.data_r; |
391 | 1.53k | data_vg = blk.data_g; |
392 | 1.53k | data_vb = blk.data_a; |
393 | 1.53k | } |
394 | 3.39k | else |
395 | 3.39k | { |
396 | 3.39k | assert(omitted_component == 3); |
397 | | |
398 | 3.39k | error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); |
399 | 3.39k | data_vr = blk.data_r; |
400 | 3.39k | data_vg = blk.data_g; |
401 | 3.39k | data_vb = blk.data_b; |
402 | 3.39k | } |
403 | | |
404 | 7.79k | error_weight = error_weight * (1.0f / 3.0f); |
405 | | |
406 | 7.79k | if (omitted_component == 3) |
407 | 3.39k | { |
408 | 3.39k | compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); |
409 | 3.39k | } |
410 | 4.39k | else |
411 | 4.39k | { |
412 | 4.39k | compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); |
413 | 4.39k | } |
414 | | |
415 | 7.79k | bool is_constant_wes { true }; |
416 | 7.79k | float partition0_len_sq { 0.0f }; |
417 | | |
418 | 17.5k | for (unsigned int i = 0; i < partition_count; i++) |
419 | 9.79k | { |
420 | 9.79k | vfloat4 dir = pms[i].dir; |
421 | 9.79k | if (hadd_rgb_s(dir) < 0.0f) |
422 | 226 | { |
423 | 226 | dir = vfloat4::zero() - dir; |
424 | 226 | } |
425 | | |
426 | 9.79k | line3 line { pms[i].avg, normalize_safe(dir, unit3()) }; |
427 | 9.79k | float lowparam { 1e10f }; |
428 | 9.79k | float highparam { -1e10f }; |
429 | | |
430 | 9.79k | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
431 | 201k | for (unsigned int j = 0; j < partition_texel_count; j++) |
432 | 191k | { |
433 | 191k | unsigned int tix = pi.texels_of_partition[i][j]; |
434 | 191k | vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); |
435 | 191k | float param = dot3_s(point - line.a, line.b); |
436 | 191k | ei.weights[tix] = param; |
437 | | |
438 | 191k | lowparam = astc::min(param, lowparam); |
439 | 191k | highparam = astc::max(param, highparam); |
440 | 191k | } |
441 | | |
442 | | // It is possible for a uniform-color partition to produce length=0; |
443 | | // this causes NaN issues so set to small value to avoid this problem |
444 | 9.79k | if (highparam <= lowparam) |
445 | 620 | { |
446 | 620 | lowparam = 0.0f; |
447 | 620 | highparam = 1e-7f; |
448 | 620 | } |
449 | | |
450 | 9.79k | float length = highparam - lowparam; |
451 | 9.79k | float length_squared = length * length; |
452 | 9.79k | float scale = 1.0f / length; |
453 | | |
454 | 9.79k | if (i == 0) |
455 | 7.79k | { |
456 | 7.79k | partition0_len_sq = length_squared; |
457 | 7.79k | } |
458 | 2.00k | else |
459 | 2.00k | { |
460 | 2.00k | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
461 | 2.00k | } |
462 | | |
463 | 201k | for (unsigned int j = 0; j < partition_texel_count; j++) |
464 | 191k | { |
465 | 191k | unsigned int tix = pi.texels_of_partition[i][j]; |
466 | 191k | float idx = (ei.weights[tix] - lowparam) * scale; |
467 | 191k | idx = astc::clamp1f(idx); |
468 | | |
469 | 191k | ei.weights[tix] = idx; |
470 | 191k | ei.weight_error_scale[tix] = length_squared * error_weight; |
471 | 191k | assert(!astc::isnan(ei.weight_error_scale[tix])); |
472 | 191k | } |
473 | | |
474 | 9.79k | vfloat4 ep0 = line.a + line.b * lowparam; |
475 | 9.79k | vfloat4 ep1 = line.a + line.b * highparam; |
476 | | |
477 | 9.79k | vfloat4 bmin = blk.data_min; |
478 | 9.79k | vfloat4 bmax = blk.data_max; |
479 | | |
480 | 9.79k | assert(omitted_component < BLOCK_MAX_COMPONENTS); |
481 | 9.79k | switch (omitted_component) |
482 | 9.79k | { |
483 | 1.41k | case 0: |
484 | 1.41k | ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>()); |
485 | 1.41k | ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>()); |
486 | 1.41k | break; |
487 | 1.44k | case 1: |
488 | 1.44k | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>()); |
489 | 1.44k | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>()); |
490 | 1.44k | break; |
491 | 1.53k | case 2: |
492 | 1.53k | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>()); |
493 | 1.53k | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>()); |
494 | 1.53k | break; |
495 | 5.40k | default: |
496 | 5.40k | ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>()); |
497 | 5.40k | ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>()); |
498 | 5.40k | break; |
499 | 9.79k | } |
500 | 9.79k | } |
501 | | |
502 | | // Zero initialize any SIMD over-fetch |
503 | 7.79k | size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
504 | 10.1k | for (size_t i = texel_count; i < texel_count_simd; i++) |
505 | 2.35k | { |
506 | 2.35k | ei.weights[i] = 0.0f; |
507 | 2.35k | ei.weight_error_scale[i] = 0.0f; |
508 | 2.35k | } |
509 | | |
510 | 7.79k | ei.is_constant_weight_error_scale = is_constant_wes; |
511 | 7.79k | } |
512 | | |
513 | | /** |
514 | | * @brief Compute the ideal endpoints and weights for 4 color components. |
515 | | * |
516 | | * @param blk The image block color data to compress. |
517 | | * @param pi The partition info for the current trial. |
518 | | * @param[out] ei The computed ideal endpoints and weights. |
519 | | */ |
520 | | static void compute_ideal_colors_and_weights_4_comp( |
521 | | const image_block& blk, |
522 | | const partition_info& pi, |
523 | | endpoints_and_weights& ei |
524 | 9.32k | ) { |
525 | 9.32k | const float error_weight = hadd_s(blk.channel_weight) / 4.0f; |
526 | | |
527 | 9.32k | unsigned int partition_count = pi.partition_count; |
528 | | |
529 | 9.32k | unsigned int texel_count = blk.texel_count; |
530 | 9.32k | promise(texel_count > 0); |
531 | 9.32k | promise(partition_count > 0); |
532 | | |
533 | 9.32k | partition_metrics pms[BLOCK_MAX_PARTITIONS]; |
534 | | |
535 | 9.32k | compute_avgs_and_dirs_4_comp(pi, blk, pms); |
536 | | |
537 | 9.32k | bool is_constant_wes { true }; |
538 | 9.32k | float partition0_len_sq { 0.0f }; |
539 | | |
540 | 30.2k | for (unsigned int i = 0; i < partition_count; i++) |
541 | 20.9k | { |
542 | 20.9k | vfloat4 dir = pms[i].dir; |
543 | 20.9k | if (hadd_rgb_s(dir) < 0.0f) |
544 | 2.20k | { |
545 | 2.20k | dir = vfloat4::zero() - dir; |
546 | 2.20k | } |
547 | | |
548 | 20.9k | line4 line { pms[i].avg, normalize_safe(dir, unit4()) }; |
549 | 20.9k | float lowparam { 1e10f }; |
550 | 20.9k | float highparam { -1e10f }; |
551 | | |
552 | 20.9k | unsigned int partition_texel_count = pi.partition_texel_count[i]; |
553 | 274k | for (unsigned int j = 0; j < partition_texel_count; j++) |
554 | 254k | { |
555 | 254k | unsigned int tix = pi.texels_of_partition[i][j]; |
556 | 254k | vfloat4 point = blk.texel(tix); |
557 | 254k | float param = dot_s(point - line.a, line.b); |
558 | 254k | ei.weights[tix] = param; |
559 | | |
560 | 254k | lowparam = astc::min(param, lowparam); |
561 | 254k | highparam = astc::max(param, highparam); |
562 | 254k | } |
563 | | |
564 | | // It is possible for a uniform-color partition to produce length=0; |
565 | | // this causes NaN issues so set to small value to avoid this problem |
566 | 20.9k | if (highparam <= lowparam) |
567 | 1.93k | { |
568 | 1.93k | lowparam = 0.0f; |
569 | 1.93k | highparam = 1e-7f; |
570 | 1.93k | } |
571 | | |
572 | 20.9k | float length = highparam - lowparam; |
573 | 20.9k | float length_squared = length * length; |
574 | 20.9k | float scale = 1.0f / length; |
575 | | |
576 | 20.9k | if (i == 0) |
577 | 9.32k | { |
578 | 9.32k | partition0_len_sq = length_squared; |
579 | 9.32k | } |
580 | 11.6k | else |
581 | 11.6k | { |
582 | 11.6k | is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; |
583 | 11.6k | } |
584 | | |
585 | 20.9k | ei.ep.endpt0[i] = line.a + line.b * lowparam; |
586 | 20.9k | ei.ep.endpt1[i] = line.a + line.b * highparam; |
587 | | |
588 | 274k | for (unsigned int j = 0; j < partition_texel_count; j++) |
589 | 254k | { |
590 | 254k | unsigned int tix = pi.texels_of_partition[i][j]; |
591 | 254k | float idx = (ei.weights[tix] - lowparam) * scale; |
592 | 254k | idx = astc::clamp1f(idx); |
593 | | |
594 | 254k | ei.weights[tix] = idx; |
595 | 254k | ei.weight_error_scale[tix] = length_squared * error_weight; |
596 | 254k | assert(!astc::isnan(ei.weight_error_scale[tix])); |
597 | 254k | } |
598 | 20.9k | } |
599 | | |
600 | | // Zero initialize any SIMD over-fetch |
601 | 9.32k | size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count); |
602 | 12.7k | for (size_t i = texel_count; i < texel_count_simd; i++) |
603 | 3.44k | { |
604 | 3.44k | ei.weights[i] = 0.0f; |
605 | 3.44k | ei.weight_error_scale[i] = 0.0f; |
606 | 3.44k | } |
607 | | |
608 | 9.32k | ei.is_constant_weight_error_scale = is_constant_wes; |
609 | 9.32k | } |
610 | | |
611 | | /* See header for documentation. */ |
612 | | void compute_ideal_colors_and_weights_1plane( |
613 | | const image_block& blk, |
614 | | const partition_info& pi, |
615 | | endpoints_and_weights& ei |
616 | 10.9k | ) { |
617 | 10.9k | bool uses_alpha = !blk.is_constant_channel(3); |
618 | | |
619 | 10.9k | if (uses_alpha) |
620 | 9.32k | { |
621 | 9.32k | compute_ideal_colors_and_weights_4_comp(blk, pi, ei); |
622 | 9.32k | } |
623 | 1.60k | else |
624 | 1.60k | { |
625 | 1.60k | compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3); |
626 | 1.60k | } |
627 | 10.9k | } |
628 | | |
629 | | /* See header for documentation. */ |
630 | | void compute_ideal_colors_and_weights_2planes( |
631 | | const block_size_descriptor& bsd, |
632 | | const image_block& blk, |
633 | | unsigned int plane2_component, |
634 | | endpoints_and_weights& ei1, |
635 | | endpoints_and_weights& ei2 |
636 | 6.99k | ) { |
637 | 6.99k | const auto& pi = bsd.get_partition_info(1, 0); |
638 | 6.99k | bool uses_alpha = !blk.is_constant_channel(3); |
639 | | |
640 | 6.99k | assert(plane2_component < BLOCK_MAX_COMPONENTS); |
641 | 6.99k | switch (plane2_component) |
642 | 6.99k | { |
643 | 1.66k | case 0: // Separate weights for red |
644 | 1.66k | if (uses_alpha) |
645 | 1.41k | { |
646 | 1.41k | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0); |
647 | 1.41k | } |
648 | 249 | else |
649 | 249 | { |
650 | 249 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2); |
651 | 249 | } |
652 | 1.66k | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0); |
653 | 1.66k | break; |
654 | | |
655 | 1.69k | case 1: // Separate weights for green |
656 | 1.69k | if (uses_alpha) |
657 | 1.44k | { |
658 | 1.44k | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1); |
659 | 1.44k | } |
660 | 250 | else |
661 | 250 | { |
662 | 250 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2); |
663 | 250 | } |
664 | 1.69k | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1); |
665 | 1.69k | break; |
666 | | |
667 | 1.83k | case 2: // Separate weights for blue |
668 | 1.83k | if (uses_alpha) |
669 | 1.53k | { |
670 | 1.53k | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2); |
671 | 1.53k | } |
672 | 303 | else |
673 | 303 | { |
674 | 303 | compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1); |
675 | 303 | } |
676 | 1.83k | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2); |
677 | 1.83k | break; |
678 | | |
679 | 1.79k | default: // Separate weights for alpha |
680 | 1.79k | assert(uses_alpha); |
681 | 1.79k | compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3); |
682 | 1.79k | compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3); |
683 | 1.79k | break; |
684 | 6.99k | } |
685 | 6.99k | } |
686 | | |
687 | | /* See header for documentation. */ |
688 | | float compute_error_of_weight_set_1plane( |
689 | | const endpoints_and_weights& eai, |
690 | | const decimation_info& di, |
691 | | const float* dec_weight_quant_uvalue |
692 | 176k | ) { |
693 | 176k | vfloatacc error_summav = vfloatacc::zero(); |
694 | 176k | unsigned int texel_count = di.texel_count; |
695 | 176k | promise(texel_count > 0); |
696 | | |
697 | | // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized |
698 | 176k | if (di.max_texel_weight_count > 2) |
699 | 42.3k | { |
700 | 979k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
701 | 936k | { |
702 | | // Compute the bilinear interpolation of the decimated weight grid |
703 | 936k | vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i); |
704 | | |
705 | | // Compute the error between the computed value and the ideal weight |
706 | 936k | vfloat actual_values = loada(eai.weights + i); |
707 | 936k | vfloat diff = current_values - actual_values; |
708 | 936k | vfloat significance = loada(eai.weight_error_scale + i); |
709 | 936k | vfloat error = diff * diff * significance; |
710 | | |
711 | 936k | haccumulate(error_summav, error); |
712 | 936k | } |
713 | 42.3k | } |
714 | 134k | else if (di.max_texel_weight_count > 1) |
715 | 59.3k | { |
716 | 720k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
717 | 660k | { |
718 | | // Compute the bilinear interpolation of the decimated weight grid |
719 | 660k | vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i); |
720 | | |
721 | | // Compute the error between the computed value and the ideal weight |
722 | 660k | vfloat actual_values = loada(eai.weights + i); |
723 | 660k | vfloat diff = current_values - actual_values; |
724 | 660k | vfloat significance = loada(eai.weight_error_scale + i); |
725 | 660k | vfloat error = diff * diff * significance; |
726 | | |
727 | 660k | haccumulate(error_summav, error); |
728 | 660k | } |
729 | 59.3k | } |
730 | 74.9k | else |
731 | 74.9k | { |
732 | 428k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
733 | 353k | { |
734 | | // Load the weight set directly, without interpolation |
735 | 353k | vfloat current_values = loada(dec_weight_quant_uvalue + i); |
736 | | |
737 | | // Compute the error between the computed value and the ideal weight |
738 | 353k | vfloat actual_values = loada(eai.weights + i); |
739 | 353k | vfloat diff = current_values - actual_values; |
740 | 353k | vfloat significance = loada(eai.weight_error_scale + i); |
741 | 353k | vfloat error = diff * diff * significance; |
742 | | |
743 | 353k | haccumulate(error_summav, error); |
744 | 353k | } |
745 | 74.9k | } |
746 | | |
747 | | // Resolve the final scalar accumulator sum |
748 | 176k | return hadd_s(error_summav); |
749 | 176k | } |
750 | | |
751 | | /* See header for documentation. */ |
752 | | float compute_error_of_weight_set_2planes( |
753 | | const endpoints_and_weights& eai1, |
754 | | const endpoints_and_weights& eai2, |
755 | | const decimation_info& di, |
756 | | const float* dec_weight_quant_uvalue_plane1, |
757 | | const float* dec_weight_quant_uvalue_plane2 |
758 | 49.1k | ) { |
759 | 49.1k | vfloatacc error_summav = vfloatacc::zero(); |
760 | 49.1k | unsigned int texel_count = di.texel_count; |
761 | 49.1k | promise(texel_count > 0); |
762 | | |
763 | | // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized |
764 | 49.1k | if (di.max_texel_weight_count > 2) |
765 | 21.4k | { |
766 | 290k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
767 | 268k | { |
768 | | // Plane 1 |
769 | | // Compute the bilinear interpolation of the decimated weight grid |
770 | 268k | vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i); |
771 | | |
772 | | // Compute the error between the computed value and the ideal weight |
773 | 268k | vfloat actual_values1 = loada(eai1.weights + i); |
774 | 268k | vfloat diff = current_values1 - actual_values1; |
775 | 268k | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
776 | | |
777 | | // Plane 2 |
778 | | // Compute the bilinear interpolation of the decimated weight grid |
779 | 268k | vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i); |
780 | | |
781 | | // Compute the error between the computed value and the ideal weight |
782 | 268k | vfloat actual_values2 = loada(eai2.weights + i); |
783 | 268k | diff = current_values2 - actual_values2; |
784 | 268k | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
785 | | |
786 | 268k | haccumulate(error_summav, error1 + error2); |
787 | 268k | } |
788 | 21.4k | } |
789 | 27.7k | else if (di.max_texel_weight_count > 1) |
790 | 17.4k | { |
791 | 103k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
792 | 86.1k | { |
793 | | // Plane 1 |
794 | | // Compute the bilinear interpolation of the decimated weight grid |
795 | 86.1k | vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i); |
796 | | |
797 | | // Compute the error between the computed value and the ideal weight |
798 | 86.1k | vfloat actual_values1 = loada(eai1.weights + i); |
799 | 86.1k | vfloat diff = current_values1 - actual_values1; |
800 | 86.1k | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
801 | | |
802 | | // Plane 2 |
803 | | // Compute the bilinear interpolation of the decimated weight grid |
804 | 86.1k | vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i); |
805 | | |
806 | | // Compute the error between the computed value and the ideal weight |
807 | 86.1k | vfloat actual_values2 = loada(eai2.weights + i); |
808 | 86.1k | diff = current_values2 - actual_values2; |
809 | 86.1k | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
810 | | |
811 | 86.1k | haccumulate(error_summav, error1 + error2); |
812 | 86.1k | } |
813 | 17.4k | } |
814 | 10.3k | else |
815 | 10.3k | { |
816 | 53.0k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
817 | 42.6k | { |
818 | | // Plane 1 |
819 | | // Load the weight set directly, without interpolation |
820 | 42.6k | vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i); |
821 | | |
822 | | // Compute the error between the computed value and the ideal weight |
823 | 42.6k | vfloat actual_values1 = loada(eai1.weights + i); |
824 | 42.6k | vfloat diff = current_values1 - actual_values1; |
825 | 42.6k | vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); |
826 | | |
827 | | // Plane 2 |
828 | | // Load the weight set directly, without interpolation |
829 | 42.6k | vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i); |
830 | | |
831 | | // Compute the error between the computed value and the ideal weight |
832 | 42.6k | vfloat actual_values2 = loada(eai2.weights + i); |
833 | 42.6k | diff = current_values2 - actual_values2; |
834 | 42.6k | vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); |
835 | | |
836 | 42.6k | haccumulate(error_summav, error1 + error2); |
837 | 42.6k | } |
838 | 10.3k | } |
839 | | |
840 | | // Resolve the final scalar accumulator sum |
841 | 49.1k | return hadd_s(error_summav); |
842 | 49.1k | } |
843 | | |
844 | | /* See header for documentation. */ |
845 | | void compute_ideal_weights_for_decimation( |
846 | | const endpoints_and_weights& ei, |
847 | | const decimation_info& di, |
848 | | float* dec_weight_ideal_value |
849 | 148k | ) { |
850 | 148k | unsigned int texel_count = di.texel_count; |
851 | 148k | unsigned int weight_count = di.weight_count; |
852 | 148k | bool is_direct = texel_count == weight_count; |
853 | 148k | promise(texel_count > 0); |
854 | 148k | promise(weight_count > 0); |
855 | | |
856 | | // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the |
857 | | // zero-initialized SIMD over-fetch region |
858 | 148k | if (is_direct) |
859 | 21.3k | { |
860 | 125k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
861 | 103k | { |
862 | 103k | vfloat weight(ei.weights + i); |
863 | 103k | storea(weight, dec_weight_ideal_value + i); |
864 | 103k | } |
865 | | |
866 | 21.3k | return; |
867 | 21.3k | } |
868 | | |
869 | | // Otherwise compute an estimate and perform single refinement iteration |
870 | | |
871 | | // Compute an initial average for each decimated weight |
872 | 127k | bool constant_wes = ei.is_constant_weight_error_scale; |
873 | 127k | vfloat weight_error_scale(ei.weight_error_scale[0]); |
874 | | |
875 | | // This overshoots - this is OK as we initialize the array tails in the |
876 | | // decimation table structures to safe values ... |
877 | 821k | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
878 | 693k | { |
879 | | // Start with a small value to avoid div-by-zero later |
880 | 693k | vfloat weight_weight(1e-10f); |
881 | 693k | vfloat initial_weight = vfloat::zero(); |
882 | | |
883 | | // Accumulate error weighting of all the texels using this weight |
884 | 693k | vint weight_texel_count(di.weight_texel_count + i); |
885 | 693k | unsigned int max_texel_count = hmax_s(weight_texel_count); |
886 | 693k | promise(max_texel_count > 0); |
887 | | |
888 | 6.00M | for (unsigned int j = 0; j < max_texel_count; j++) |
889 | 5.30M | { |
890 | 5.30M | const uint8_t* texel = di.weight_texels_tr[j] + i; |
891 | 5.30M | vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); |
892 | | |
893 | 5.30M | if (!constant_wes) |
894 | 2.10M | { |
895 | 2.10M | weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel); |
896 | 2.10M | } |
897 | | |
898 | 5.30M | vfloat contrib_weight = weight * weight_error_scale; |
899 | | |
900 | 5.30M | weight_weight += contrib_weight; |
901 | 5.30M | initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight; |
902 | 5.30M | } |
903 | | |
904 | 693k | storea(initial_weight / weight_weight, dec_weight_ideal_value + i); |
905 | 693k | } |
906 | | |
907 | | // Populate the interpolated weight grid based on the initial average |
908 | | // Process SIMD-width texel coordinates at at time while we can. Safe to |
909 | | // over-process full SIMD vectors - the tail is zeroed. |
910 | 127k | ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS]; |
911 | 127k | if (di.max_texel_weight_count <= 2) |
912 | 53.7k | { |
913 | 528k | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
914 | 474k | { |
915 | 474k | vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i); |
916 | 474k | storea(weight, infilled_weights + i); |
917 | 474k | } |
918 | 53.7k | } |
919 | 73.5k | else |
920 | 73.5k | { |
921 | 1.30M | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
922 | 1.22M | { |
923 | 1.22M | vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i); |
924 | 1.22M | storea(weight, infilled_weights + i); |
925 | 1.22M | } |
926 | 73.5k | } |
927 | | |
928 | | // Perform a single iteration of refinement |
929 | | // Empirically determined step size; larger values don't help but smaller drops image quality |
930 | 127k | constexpr float stepsize = 0.25f; |
931 | 127k | constexpr float chd_scale = -WEIGHTS_TEXEL_SUM; |
932 | | |
933 | 821k | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
934 | 693k | { |
935 | 693k | vfloat weight_val = loada(dec_weight_ideal_value + i); |
936 | | |
937 | | // Accumulate error weighting of all the texels using this weight |
938 | | // Start with a small value to avoid div-by-zero later |
939 | 693k | vfloat error_change0(1e-10f); |
940 | 693k | vfloat error_change1(0.0f); |
941 | | |
942 | | // Accumulate error weighting of all the texels using this weight |
943 | 693k | vint weight_texel_count(di.weight_texel_count + i); |
944 | 693k | unsigned int max_texel_count = hmax_s(weight_texel_count); |
945 | 693k | promise(max_texel_count > 0); |
946 | | |
947 | 6.00M | for (unsigned int j = 0; j < max_texel_count; j++) |
948 | 5.30M | { |
949 | 5.30M | const uint8_t* texel = di.weight_texels_tr[j] + i; |
950 | 5.30M | vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); |
951 | | |
952 | 5.30M | if (!constant_wes) |
953 | 2.10M | { |
954 | 2.10M | weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel); |
955 | 2.10M | } |
956 | | |
957 | 5.30M | vfloat scale = weight_error_scale * contrib_weight; |
958 | 5.30M | vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel); |
959 | 5.30M | vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel); |
960 | | |
961 | 5.30M | error_change0 += contrib_weight * scale; |
962 | 5.30M | error_change1 += (old_weight - ideal_weight) * scale; |
963 | 5.30M | } |
964 | | |
965 | 693k | vfloat step = (error_change1 * chd_scale) / error_change0; |
966 | 693k | step = clamp(-stepsize, stepsize, step); |
967 | | |
968 | | // Update the weight; note this can store negative values |
969 | 693k | storea(weight_val + step, dec_weight_ideal_value + i); |
970 | 693k | } |
971 | 127k | } |
972 | | |
973 | | /* See header for documentation. */ |
974 | | void compute_quantized_weights_for_decimation( |
975 | | const decimation_info& di, |
976 | | float low_bound, |
977 | | float high_bound, |
978 | | const float* dec_weight_ideal_value, |
979 | | float* weight_set_out, |
980 | | uint8_t* quantized_weight_set, |
981 | | quant_method quant_level |
982 | 275k | ) { |
983 | 275k | int weight_count = di.weight_count; |
984 | 275k | promise(weight_count > 0); |
985 | 275k | const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level]; |
986 | | |
987 | | // The available quant levels, stored with a minus 1 bias |
988 | 275k | static const float quant_levels_m1[12] { |
989 | 275k | 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f |
990 | 275k | }; |
991 | | |
992 | 275k | vint steps_m1(get_quant_level(quant_level) - 1); |
993 | 275k | float quant_level_m1 = quant_levels_m1[quant_level]; |
994 | | |
995 | | // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds |
996 | | |
997 | | // TODO: Oddity to investigate; triggered by test in issue #265. |
998 | 275k | if (high_bound <= low_bound) |
999 | 3 | { |
1000 | 3 | low_bound = 0.0f; |
1001 | 3 | high_bound = 1.0f; |
1002 | 3 | } |
1003 | | |
1004 | 275k | float rscale = high_bound - low_bound; |
1005 | 275k | float scale = 1.0f / rscale; |
1006 | | |
1007 | 275k | float scaled_low_bound = low_bound * scale; |
1008 | 275k | rscale *= 1.0f / 64.0f; |
1009 | | |
1010 | 275k | vfloat scalev(scale); |
1011 | 275k | vfloat scaled_low_boundv(scaled_low_bound); |
1012 | 275k | vfloat quant_level_m1v(quant_level_m1); |
1013 | 275k | vfloat rscalev(rscale); |
1014 | 275k | vfloat low_boundv(low_bound); |
1015 | | |
1016 | | // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known |
1017 | | // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements |
1018 | 275k | if (get_quant_level(quant_level) <= 16) |
1019 | 251k | { |
1020 | 251k | vtable_16x8 table; |
1021 | 251k | vtable_prepare(table, qat.quant_to_unquant); |
1022 | | |
1023 | 1.65M | for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1024 | 1.40M | { |
1025 | 1.40M | vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; |
1026 | 1.40M | ix = clampzo(ix); |
1027 | | |
1028 | | // Look up the two closest indexes and return the one that was closest |
1029 | 1.40M | vfloat ix1 = ix * quant_level_m1v; |
1030 | | |
1031 | 1.40M | vint weightl = float_to_int(ix1); |
1032 | 1.40M | vint weighth = min(weightl + vint(1), steps_m1); |
1033 | | |
1034 | 1.40M | vint ixli = vtable_lookup_32bit(table, weightl); |
1035 | 1.40M | vint ixhi = vtable_lookup_32bit(table, weighth); |
1036 | | |
1037 | 1.40M | vfloat ixl = int_to_float(ixli); |
1038 | 1.40M | vfloat ixh = int_to_float(ixhi); |
1039 | | |
1040 | 1.40M | vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); |
1041 | 1.40M | vint weight = select(ixli, ixhi, mask); |
1042 | 1.40M | ixl = select(ixl, ixh, mask); |
1043 | | |
1044 | | // Invert the weight-scaling that was done initially |
1045 | 1.40M | storea(ixl * rscalev + low_boundv, weight_set_out + i); |
1046 | 1.40M | pack_and_store_low_bytes(weight, quantized_weight_set + i); |
1047 | 1.40M | } |
1048 | 251k | } |
1049 | 23.8k | else |
1050 | 23.8k | { |
1051 | 23.8k | vtable_32x8 table; |
1052 | 23.8k | vtable_prepare(table, qat.quant_to_unquant); |
1053 | | |
1054 | 111k | for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1055 | 87.8k | { |
1056 | 87.8k | vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; |
1057 | 87.8k | ix = clampzo(ix); |
1058 | | |
1059 | | // Look up the two closest indexes and return the one that was closest |
1060 | 87.8k | vfloat ix1 = ix * quant_level_m1v; |
1061 | | |
1062 | 87.8k | vint weightl = float_to_int(ix1); |
1063 | 87.8k | vint weighth = min(weightl + vint(1), steps_m1); |
1064 | | |
1065 | 87.8k | vint ixli = vtable_lookup_32bit(table, weightl); |
1066 | 87.8k | vint ixhi = vtable_lookup_32bit(table, weighth); |
1067 | | |
1068 | 87.8k | vfloat ixl = int_to_float(ixli); |
1069 | 87.8k | vfloat ixh = int_to_float(ixhi); |
1070 | | |
1071 | 87.8k | vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); |
1072 | 87.8k | vint weight = select(ixli, ixhi, mask); |
1073 | 87.8k | ixl = select(ixl, ixh, mask); |
1074 | | |
1075 | | // Invert the weight-scaling that was done initially |
1076 | 87.8k | storea(ixl * rscalev + low_boundv, weight_set_out + i); |
1077 | 87.8k | pack_and_store_low_bytes(weight, quantized_weight_set + i); |
1078 | 87.8k | } |
1079 | 23.8k | } |
1080 | 275k | } |
1081 | | |
1082 | | /** |
1083 | | * @brief Compute the RGB + offset for a HDR endpoint mode #7. |
1084 | | * |
1085 | | * Since the matrix needed has a regular structure we can simplify the inverse calculation. This |
1086 | | * gives us ~24 multiplications vs. 96 for a generic inverse. |
1087 | | * |
1088 | | * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x); |
1089 | | * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y); |
1090 | | * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z); |
1091 | | * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum); |
1092 | | * mat = invert(mat); |
1093 | | * |
1094 | | * @param rgba_weight_sum Sum of partition component error weights. |
1095 | | * @param weight_weight_sum Sum of partition component error weights * texel weight. |
1096 | | * @param rgbq_sum Sum of partition component error weights * texel weight * color data. |
1097 | | * @param psum Sum of RGB color weights * texel weight^2. |
1098 | | */ |
1099 | | static inline vfloat4 compute_rgbo_vector( |
1100 | | vfloat4 rgba_weight_sum, |
1101 | | vfloat4 weight_weight_sum, |
1102 | | vfloat4 rgbq_sum, |
1103 | | float psum |
1104 | 58.8k | ) { |
1105 | 58.8k | float X = rgba_weight_sum.lane<0>(); |
1106 | 58.8k | float Y = rgba_weight_sum.lane<1>(); |
1107 | 58.8k | float Z = rgba_weight_sum.lane<2>(); |
1108 | 58.8k | float P = weight_weight_sum.lane<0>(); |
1109 | 58.8k | float Q = weight_weight_sum.lane<1>(); |
1110 | 58.8k | float R = weight_weight_sum.lane<2>(); |
1111 | 58.8k | float S = psum; |
1112 | | |
1113 | 58.8k | float PP = P * P; |
1114 | 58.8k | float QQ = Q * Q; |
1115 | 58.8k | float RR = R * R; |
1116 | | |
1117 | 58.8k | float SZmRR = S * Z - RR; |
1118 | 58.8k | float DT = SZmRR * Y - Z * QQ; |
1119 | 58.8k | float YP = Y * P; |
1120 | 58.8k | float QX = Q * X; |
1121 | 58.8k | float YX = Y * X; |
1122 | 58.8k | float mZYP = -Z * YP; |
1123 | 58.8k | float mZQX = -Z * QX; |
1124 | 58.8k | float mRYX = -R * YX; |
1125 | 58.8k | float ZQP = Z * Q * P; |
1126 | 58.8k | float RYP = R * YP; |
1127 | 58.8k | float RQX = R * QX; |
1128 | | |
1129 | | // Compute the reciprocal of matrix determinant |
1130 | 58.8k | float rdet = 1.0f / (DT * X + mZYP * P); |
1131 | | |
1132 | | // Actually compute the adjugate, and then apply 1/det separately |
1133 | 58.8k | vfloat4 mat0(DT, ZQP, RYP, mZYP); |
1134 | 58.8k | vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX); |
1135 | 58.8k | vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX); |
1136 | 58.8k | vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX); |
1137 | 58.8k | vfloat4 vect = rgbq_sum * rdet; |
1138 | | |
1139 | 58.8k | return vfloat4(dot_s(mat0, vect), |
1140 | 58.8k | dot_s(mat1, vect), |
1141 | 58.8k | dot_s(mat2, vect), |
1142 | 58.8k | dot_s(mat3, vect)); |
1143 | 58.8k | } |
1144 | | |
1145 | | /* See header for documentation. */ |
1146 | | void recompute_ideal_colors_1plane( |
1147 | | const image_block& blk, |
1148 | | const partition_info& pi, |
1149 | | const decimation_info& di, |
1150 | | const uint8_t* dec_weights_uquant, |
1151 | | endpoints& ep, |
1152 | | vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS], |
1153 | | vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS] |
1154 | 45.8k | ) { |
1155 | 45.8k | unsigned int weight_count = di.weight_count; |
1156 | 45.8k | unsigned int total_texel_count = blk.texel_count; |
1157 | 45.8k | unsigned int partition_count = pi.partition_count; |
1158 | | |
1159 | 45.8k | promise(weight_count > 0); |
1160 | 45.8k | promise(total_texel_count > 0); |
1161 | 45.8k | promise(partition_count > 0); |
1162 | | |
1163 | 45.8k | ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS]; |
1164 | 292k | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1165 | 246k | { |
1166 | 246k | vint unquant_value(dec_weights_uquant + i); |
1167 | 246k | vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f); |
1168 | 246k | storea(unquant_valuef, dec_weight + i); |
1169 | 246k | } |
1170 | | |
1171 | 45.8k | ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS]; |
1172 | 45.8k | float* undec_weight_ref; |
1173 | 45.8k | if (di.max_texel_weight_count == 1) |
1174 | 38.0k | { |
1175 | 38.0k | undec_weight_ref = dec_weight; |
1176 | 38.0k | } |
1177 | 7.83k | else if (di.max_texel_weight_count <= 2) |
1178 | 6.53k | { |
1179 | 93.7k | for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) |
1180 | 87.1k | { |
1181 | 87.1k | vfloat weight = bilinear_infill_vla_2(di, dec_weight, i); |
1182 | 87.1k | storea(weight, undec_weight + i); |
1183 | 87.1k | } |
1184 | | |
1185 | 6.53k | undec_weight_ref = undec_weight; |
1186 | 6.53k | } |
1187 | 1.29k | else |
1188 | 1.29k | { |
1189 | 33.4k | for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) |
1190 | 32.1k | { |
1191 | 32.1k | vfloat weight = bilinear_infill_vla(di, dec_weight, i); |
1192 | 32.1k | storea(weight, undec_weight + i); |
1193 | 32.1k | } |
1194 | | |
1195 | 1.29k | undec_weight_ref = undec_weight; |
1196 | 1.29k | } |
1197 | | |
1198 | 45.8k | vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count)); |
1199 | | |
1200 | 145k | for (unsigned int i = 0; i < partition_count; i++) |
1201 | 99.7k | { |
1202 | 99.7k | unsigned int texel_count = pi.partition_texel_count[i]; |
1203 | 99.7k | const uint8_t *texel_indexes = pi.texels_of_partition[i]; |
1204 | | |
1205 | | // Only compute a partition mean if more than one partition |
1206 | 99.7k | if (partition_count > 1) |
1207 | 86.5k | { |
1208 | 86.5k | rgba_sum = vfloat4::zero(); |
1209 | 86.5k | promise(texel_count > 0); |
1210 | 911k | for (unsigned int j = 0; j < texel_count; j++) |
1211 | 824k | { |
1212 | 824k | unsigned int tix = texel_indexes[j]; |
1213 | 824k | rgba_sum += blk.texel(tix); |
1214 | 824k | } |
1215 | 86.5k | } |
1216 | | |
1217 | 99.7k | rgba_sum = rgba_sum * blk.channel_weight; |
1218 | 99.7k | vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); |
1219 | 99.7k | vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>()); |
1220 | | |
1221 | 99.7k | float scale_max = 0.0f; |
1222 | 99.7k | float scale_min = 1e10f; |
1223 | | |
1224 | 99.7k | float wmin1 = 1.0f; |
1225 | 99.7k | float wmax1 = 0.0f; |
1226 | | |
1227 | 99.7k | float left_sum_s = 0.0f; |
1228 | 99.7k | float middle_sum_s = 0.0f; |
1229 | 99.7k | float right_sum_s = 0.0f; |
1230 | | |
1231 | 99.7k | vfloat4 color_vec_x = vfloat4::zero(); |
1232 | 99.7k | vfloat4 color_vec_y = vfloat4::zero(); |
1233 | | |
1234 | 99.7k | vfloat4 scale_vec = vfloat4::zero(); |
1235 | | |
1236 | 99.7k | float weight_weight_sum_s = 1e-17f; |
1237 | | |
1238 | 99.7k | vfloat4 color_weight = blk.channel_weight; |
1239 | 99.7k | float ls_weight = hadd_rgb_s(color_weight); |
1240 | | |
1241 | 1.27M | for (unsigned int j = 0; j < texel_count; j++) |
1242 | 1.17M | { |
1243 | 1.17M | unsigned int tix = texel_indexes[j]; |
1244 | 1.17M | vfloat4 rgba = blk.texel(tix); |
1245 | | |
1246 | 1.17M | float idx0 = undec_weight_ref[tix]; |
1247 | | |
1248 | 1.17M | float om_idx0 = 1.0f - idx0; |
1249 | 1.17M | wmin1 = astc::min(idx0, wmin1); |
1250 | 1.17M | wmax1 = astc::max(idx0, wmax1); |
1251 | | |
1252 | 1.17M | float scale = dot3_s(scale_dir, rgba); |
1253 | 1.17M | scale_min = astc::min(scale, scale_min); |
1254 | 1.17M | scale_max = astc::max(scale, scale_max); |
1255 | | |
1256 | 1.17M | left_sum_s += om_idx0 * om_idx0; |
1257 | 1.17M | middle_sum_s += om_idx0 * idx0; |
1258 | 1.17M | right_sum_s += idx0 * idx0; |
1259 | 1.17M | weight_weight_sum_s += idx0; |
1260 | | |
1261 | 1.17M | vfloat4 color_idx(idx0); |
1262 | 1.17M | vfloat4 cwprod = rgba; |
1263 | 1.17M | vfloat4 cwiprod = cwprod * color_idx; |
1264 | | |
1265 | 1.17M | color_vec_y += cwiprod; |
1266 | 1.17M | color_vec_x += cwprod - cwiprod; |
1267 | | |
1268 | 1.17M | scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight); |
1269 | 1.17M | } |
1270 | | |
1271 | 99.7k | vfloat4 left_sum = vfloat4(left_sum_s) * color_weight; |
1272 | 99.7k | vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight; |
1273 | 99.7k | vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; |
1274 | 99.7k | vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; |
1275 | | |
1276 | 99.7k | color_vec_x = color_vec_x * color_weight; |
1277 | 99.7k | color_vec_y = color_vec_y * color_weight; |
1278 | | |
1279 | | // Initialize the luminance and scale vectors with a reasonable default |
1280 | 99.7k | float scalediv = scale_min / astc::max(scale_max, 1e-10f); |
1281 | 99.7k | scalediv = astc::clamp1f(scalediv); |
1282 | | |
1283 | 99.7k | vfloat4 sds = scale_dir * scale_max; |
1284 | | |
1285 | 99.7k | rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); |
1286 | | |
1287 | 99.7k | if (wmin1 >= wmax1 * 0.999f) |
1288 | 10.7k | { |
1289 | | // If all weights in the partition were equal, then just take average of all colors in |
1290 | | // the partition and use that as both endpoint colors |
1291 | 10.7k | vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; |
1292 | | |
1293 | 10.7k | vmask4 notnan_mask = avg == avg; |
1294 | 10.7k | ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask); |
1295 | 10.7k | ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask); |
1296 | | |
1297 | 10.7k | rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); |
1298 | 10.7k | } |
1299 | 88.9k | else |
1300 | 88.9k | { |
1301 | | // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given |
1302 | | // set of texel weights and pixel colors |
1303 | 88.9k | vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum); |
1304 | 88.9k | vfloat4 color_rdet1 = 1.0f / color_det1; |
1305 | | |
1306 | 88.9k | float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); |
1307 | 88.9k | float ls_rdet1 = 1.0f / ls_det1; |
1308 | | |
1309 | 88.9k | vfloat4 color_mss1 = (left_sum * left_sum) |
1310 | 88.9k | + (2.0f * middle_sum * middle_sum) |
1311 | 88.9k | + (right_sum * right_sum); |
1312 | | |
1313 | 88.9k | float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) |
1314 | 88.9k | + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) |
1315 | 88.9k | + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); |
1316 | | |
1317 | 88.9k | vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1; |
1318 | 88.9k | vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1; |
1319 | | |
1320 | 88.9k | vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); |
1321 | 88.9k | vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); |
1322 | 88.9k | vmask4 full_mask = det_mask & notnan_mask; |
1323 | | |
1324 | 88.9k | ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask); |
1325 | 88.9k | ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask); |
1326 | | |
1327 | 88.9k | float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; |
1328 | 88.9k | float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; |
1329 | | |
1330 | 88.9k | if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) |
1331 | 82.6k | { |
1332 | 82.6k | float scalediv2 = scale_ep0 / scale_ep1; |
1333 | 82.6k | vfloat4 sdsm = scale_dir * scale_ep1; |
1334 | 82.6k | rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); |
1335 | 82.6k | } |
1336 | 88.9k | } |
1337 | | |
1338 | | // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR |
1339 | 99.7k | if (blk.rgb_lns[0] || blk.alpha_lns[0]) |
1340 | 46.6k | { |
1341 | 46.6k | vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; |
1342 | 46.6k | float psum = right_sum_s * hadd_rgb_s(color_weight); |
1343 | | |
1344 | 46.6k | vfloat4 rgbq_sum = color_vec_x + color_vec_y; |
1345 | 46.6k | rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); |
1346 | | |
1347 | 46.6k | vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); |
1348 | 46.6k | rgbo_vectors[i] = rgbovec; |
1349 | | |
1350 | | // We can get a failure due to the use of a singular (non-invertible) matrix |
1351 | | // If it failed, compute rgbo_vectors[] with a different method ... |
1352 | 46.6k | if (astc::isnan(dot_s(rgbovec, rgbovec))) |
1353 | 4.43k | { |
1354 | 4.43k | vfloat4 v0 = ep.endpt0[i]; |
1355 | 4.43k | vfloat4 v1 = ep.endpt1[i]; |
1356 | | |
1357 | 4.43k | float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); |
1358 | 4.43k | avgdif = astc::max(avgdif, 0.0f); |
1359 | | |
1360 | 4.43k | vfloat4 avg = (v0 + v1) * 0.5f; |
1361 | 4.43k | vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; |
1362 | 4.43k | rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); |
1363 | 4.43k | } |
1364 | 46.6k | } |
1365 | 99.7k | } |
1366 | 45.8k | } |
1367 | | |
1368 | | /* See header for documentation. */ |
1369 | | void recompute_ideal_colors_2planes( |
1370 | | const image_block& blk, |
1371 | | const block_size_descriptor& bsd, |
1372 | | const decimation_info& di, |
1373 | | const uint8_t* dec_weights_uquant_plane1, |
1374 | | const uint8_t* dec_weights_uquant_plane2, |
1375 | | endpoints& ep, |
1376 | | vfloat4& rgbs_vector, |
1377 | | vfloat4& rgbo_vector, |
1378 | | int plane2_component |
1379 | 25.6k | ) { |
1380 | 25.6k | unsigned int weight_count = di.weight_count; |
1381 | 25.6k | unsigned int total_texel_count = blk.texel_count; |
1382 | | |
1383 | 25.6k | promise(total_texel_count > 0); |
1384 | 25.6k | promise(weight_count > 0); |
1385 | | |
1386 | 25.6k | ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; |
1387 | 25.6k | ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; |
1388 | | |
1389 | 25.6k | assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); |
1390 | | |
1391 | 131k | for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) |
1392 | 105k | { |
1393 | 105k | vint unquant_value1(dec_weights_uquant_plane1 + i); |
1394 | 105k | vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f); |
1395 | 105k | storea(unquant_value1f, dec_weight_plane1 + i); |
1396 | | |
1397 | 105k | vint unquant_value2(dec_weights_uquant_plane2 + i); |
1398 | 105k | vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f); |
1399 | 105k | storea(unquant_value2f, dec_weight_plane2 + i); |
1400 | 105k | } |
1401 | | |
1402 | 25.6k | ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS]; |
1403 | 25.6k | ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS]; |
1404 | | |
1405 | 25.6k | float* undec_weight_plane1_ref; |
1406 | 25.6k | float* undec_weight_plane2_ref; |
1407 | | |
1408 | 25.6k | if (di.max_texel_weight_count == 1) |
1409 | 12.1k | { |
1410 | 12.1k | undec_weight_plane1_ref = dec_weight_plane1; |
1411 | 12.1k | undec_weight_plane2_ref = dec_weight_plane2; |
1412 | 12.1k | } |
1413 | 13.5k | else if (di.max_texel_weight_count <= 2) |
1414 | 8.02k | { |
1415 | 48.8k | for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) |
1416 | 40.8k | { |
1417 | 40.8k | vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i); |
1418 | 40.8k | storea(weight, undec_weight_plane1 + i); |
1419 | | |
1420 | 40.8k | weight = bilinear_infill_vla_2(di, dec_weight_plane2, i); |
1421 | 40.8k | storea(weight, undec_weight_plane2 + i); |
1422 | 40.8k | } |
1423 | | |
1424 | 8.02k | undec_weight_plane1_ref = undec_weight_plane1; |
1425 | 8.02k | undec_weight_plane2_ref = undec_weight_plane2; |
1426 | 8.02k | } |
1427 | 5.47k | else |
1428 | 5.47k | { |
1429 | 79.6k | for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) |
1430 | 74.2k | { |
1431 | 74.2k | vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i); |
1432 | 74.2k | storea(weight, undec_weight_plane1 + i); |
1433 | | |
1434 | 74.2k | weight = bilinear_infill_vla(di, dec_weight_plane2, i); |
1435 | 74.2k | storea(weight, undec_weight_plane2 + i); |
1436 | 74.2k | } |
1437 | | |
1438 | 5.47k | undec_weight_plane1_ref = undec_weight_plane1; |
1439 | 5.47k | undec_weight_plane2_ref = undec_weight_plane2; |
1440 | 5.47k | } |
1441 | | |
1442 | 25.6k | unsigned int texel_count = bsd.texel_count; |
1443 | 25.6k | vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); |
1444 | 25.6k | vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>()); |
1445 | | |
1446 | 25.6k | float scale_max = 0.0f; |
1447 | 25.6k | float scale_min = 1e10f; |
1448 | | |
1449 | 25.6k | float wmin1 = 1.0f; |
1450 | 25.6k | float wmax1 = 0.0f; |
1451 | | |
1452 | 25.6k | float wmin2 = 1.0f; |
1453 | 25.6k | float wmax2 = 0.0f; |
1454 | | |
1455 | 25.6k | float left1_sum_s = 0.0f; |
1456 | 25.6k | float middle1_sum_s = 0.0f; |
1457 | 25.6k | float right1_sum_s = 0.0f; |
1458 | | |
1459 | 25.6k | float left2_sum_s = 0.0f; |
1460 | 25.6k | float middle2_sum_s = 0.0f; |
1461 | 25.6k | float right2_sum_s = 0.0f; |
1462 | | |
1463 | 25.6k | vfloat4 color_vec_x = vfloat4::zero(); |
1464 | 25.6k | vfloat4 color_vec_y = vfloat4::zero(); |
1465 | | |
1466 | 25.6k | vfloat4 scale_vec = vfloat4::zero(); |
1467 | | |
1468 | 25.6k | vfloat4 weight_weight_sum = vfloat4(1e-17f); |
1469 | | |
1470 | 25.6k | vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); |
1471 | 25.6k | vfloat4 color_weight = blk.channel_weight; |
1472 | 25.6k | float ls_weight = hadd_rgb_s(color_weight); |
1473 | | |
1474 | 676k | for (unsigned int j = 0; j < texel_count; j++) |
1475 | 650k | { |
1476 | 650k | vfloat4 rgba = blk.texel(j); |
1477 | | |
1478 | 650k | float idx0 = undec_weight_plane1_ref[j]; |
1479 | | |
1480 | 650k | float om_idx0 = 1.0f - idx0; |
1481 | 650k | wmin1 = astc::min(idx0, wmin1); |
1482 | 650k | wmax1 = astc::max(idx0, wmax1); |
1483 | | |
1484 | 650k | float scale = dot3_s(scale_dir, rgba); |
1485 | 650k | scale_min = astc::min(scale, scale_min); |
1486 | 650k | scale_max = astc::max(scale, scale_max); |
1487 | | |
1488 | 650k | left1_sum_s += om_idx0 * om_idx0; |
1489 | 650k | middle1_sum_s += om_idx0 * idx0; |
1490 | 650k | right1_sum_s += idx0 * idx0; |
1491 | | |
1492 | 650k | float idx1 = undec_weight_plane2_ref[j]; |
1493 | | |
1494 | 650k | float om_idx1 = 1.0f - idx1; |
1495 | 650k | wmin2 = astc::min(idx1, wmin2); |
1496 | 650k | wmax2 = astc::max(idx1, wmax2); |
1497 | | |
1498 | 650k | left2_sum_s += om_idx1 * om_idx1; |
1499 | 650k | middle2_sum_s += om_idx1 * idx1; |
1500 | 650k | right2_sum_s += idx1 * idx1; |
1501 | | |
1502 | 650k | vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask); |
1503 | | |
1504 | 650k | vfloat4 cwprod = rgba; |
1505 | 650k | vfloat4 cwiprod = cwprod * color_idx; |
1506 | | |
1507 | 650k | color_vec_y += cwiprod; |
1508 | 650k | color_vec_x += cwprod - cwiprod; |
1509 | | |
1510 | 650k | scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); |
1511 | 650k | weight_weight_sum += color_idx; |
1512 | 650k | } |
1513 | | |
1514 | 25.6k | vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; |
1515 | 25.6k | vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight; |
1516 | 25.6k | vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight; |
1517 | 25.6k | vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight; |
1518 | | |
1519 | 25.6k | vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight; |
1520 | 25.6k | vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; |
1521 | 25.6k | vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; |
1522 | | |
1523 | 25.6k | color_vec_x = color_vec_x * color_weight; |
1524 | 25.6k | color_vec_y = color_vec_y * color_weight; |
1525 | | |
1526 | | // Initialize the luminance and scale vectors with a reasonable default |
1527 | 25.6k | float scalediv = scale_min / astc::max(scale_max, 1e-10f); |
1528 | 25.6k | scalediv = astc::clamp1f(scalediv); |
1529 | | |
1530 | 25.6k | vfloat4 sds = scale_dir * scale_max; |
1531 | | |
1532 | 25.6k | rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); |
1533 | | |
1534 | 25.6k | if (wmin1 >= wmax1 * 0.999f) |
1535 | 606 | { |
1536 | | // If all weights in the partition were equal, then just take average of all colors in |
1537 | | // the partition and use that as both endpoint colors |
1538 | 606 | vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; |
1539 | | |
1540 | 606 | vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); |
1541 | 606 | vmask4 notnan_mask = avg == avg; |
1542 | 606 | vmask4 full_mask = p1_mask & notnan_mask; |
1543 | | |
1544 | 606 | ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); |
1545 | 606 | ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); |
1546 | | |
1547 | 606 | rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); |
1548 | 606 | } |
1549 | 25.0k | else |
1550 | 25.0k | { |
1551 | | // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given |
1552 | | // set of texel weights and pixel colors |
1553 | 25.0k | vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum); |
1554 | 25.0k | vfloat4 color_rdet1 = 1.0f / color_det1; |
1555 | | |
1556 | 25.0k | float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); |
1557 | 25.0k | float ls_rdet1 = 1.0f / ls_det1; |
1558 | | |
1559 | 25.0k | vfloat4 color_mss1 = (left1_sum * left1_sum) |
1560 | 25.0k | + (2.0f * middle1_sum * middle1_sum) |
1561 | 25.0k | + (right1_sum * right1_sum); |
1562 | | |
1563 | 25.0k | float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) |
1564 | 25.0k | + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) |
1565 | 25.0k | + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); |
1566 | | |
1567 | 25.0k | vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1; |
1568 | 25.0k | vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1; |
1569 | | |
1570 | 25.0k | float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; |
1571 | 25.0k | float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; |
1572 | | |
1573 | 25.0k | vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); |
1574 | 25.0k | vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); |
1575 | 25.0k | vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); |
1576 | 25.0k | vmask4 full_mask = p1_mask & det_mask & notnan_mask; |
1577 | | |
1578 | 25.0k | ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); |
1579 | 25.0k | ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); |
1580 | | |
1581 | 25.0k | if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) |
1582 | 22.9k | { |
1583 | 22.9k | float scalediv2 = scale_ep0 / scale_ep1; |
1584 | 22.9k | vfloat4 sdsm = scale_dir * scale_ep1; |
1585 | 22.9k | rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); |
1586 | 22.9k | } |
1587 | 25.0k | } |
1588 | | |
1589 | 25.6k | if (wmin2 >= wmax2 * 0.999f) |
1590 | 781 | { |
1591 | | // If all weights in the partition were equal, then just take average of all colors in |
1592 | | // the partition and use that as both endpoint colors |
1593 | 781 | vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; |
1594 | | |
1595 | 781 | vmask4 notnan_mask = avg == avg; |
1596 | 781 | vmask4 full_mask = p2_mask & notnan_mask; |
1597 | | |
1598 | 781 | ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); |
1599 | 781 | ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); |
1600 | 781 | } |
1601 | 24.8k | else |
1602 | 24.8k | { |
1603 | | // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given |
1604 | | // set of texel weights and pixel colors |
1605 | 24.8k | vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum); |
1606 | 24.8k | vfloat4 color_rdet2 = 1.0f / color_det2; |
1607 | | |
1608 | 24.8k | vfloat4 color_mss2 = (left2_sum * left2_sum) |
1609 | 24.8k | + (2.0f * middle2_sum * middle2_sum) |
1610 | 24.8k | + (right2_sum * right2_sum); |
1611 | | |
1612 | 24.8k | vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2; |
1613 | 24.8k | vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2; |
1614 | | |
1615 | 24.8k | vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f); |
1616 | 24.8k | vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); |
1617 | 24.8k | vmask4 full_mask = p2_mask & det_mask & notnan_mask; |
1618 | | |
1619 | 24.8k | ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); |
1620 | 24.8k | ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); |
1621 | 24.8k | } |
1622 | | |
1623 | | // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR |
1624 | 25.6k | if (blk.rgb_lns[0] || blk.alpha_lns[0]) |
1625 | 12.2k | { |
1626 | 12.2k | weight_weight_sum = weight_weight_sum * color_weight; |
1627 | 12.2k | float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); |
1628 | | |
1629 | 12.2k | vfloat4 rgbq_sum = color_vec_x + color_vec_y; |
1630 | 12.2k | rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); |
1631 | | |
1632 | 12.2k | rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); |
1633 | | |
1634 | | // We can get a failure due to the use of a singular (non-invertible) matrix |
1635 | | // If it failed, compute rgbo_vectors[] with a different method ... |
1636 | 12.2k | if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) |
1637 | 431 | { |
1638 | 431 | vfloat4 v0 = ep.endpt0[0]; |
1639 | 431 | vfloat4 v1 = ep.endpt1[0]; |
1640 | | |
1641 | 431 | float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); |
1642 | 431 | avgdif = astc::max(avgdif, 0.0f); |
1643 | | |
1644 | 431 | vfloat4 avg = (v0 + v1) * 0.5f; |
1645 | 431 | vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; |
1646 | | |
1647 | 431 | rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); |
1648 | 431 | } |
1649 | 12.2k | } |
1650 | 25.6k | } |
1651 | | |
1652 | | #endif |