/src/astc-encoder/Source/astcenc_image.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2026 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | /** |
19 | | * @brief Functions for creating in-memory ASTC image structures. |
20 | | */ |
21 | | |
22 | | #include <cassert> |
23 | | #include <cstring> |
24 | | |
25 | | #include "astcenc_internal.h" |
26 | | |
27 | | /** |
28 | | * @brief Loader pipeline function type for data fetch from memory. |
29 | | */ |
30 | | using pixel_loader = vfloat4(*)(const void*, size_t); |
31 | | |
32 | | /** |
33 | | * @brief Loader pipeline function type for swizzling data in a vector. |
34 | | */ |
35 | | using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); |
36 | | |
37 | | /** |
38 | | * @brief Loader pipeline function type for converting data in a vector to LNS. |
39 | | */ |
40 | | using pixel_converter = vfloat4(*)(vfloat4, vmask4); |
41 | | |
42 | | /** |
43 | | * @brief Load a 8-bit UNORM texel from a data array. |
44 | | * |
45 | | * @param data The data pointer. |
46 | | * @param base_offset The index offset to the start of the pixel. |
47 | | */ |
48 | | static vfloat4 load_texel_u8( |
49 | | const void* data, |
50 | | size_t base_offset |
51 | 25.9k | ) { |
52 | 25.9k | const uint8_t* data8 = static_cast<const uint8_t*>(data); |
53 | 25.9k | return int_to_float(vint4(data8 + base_offset)) / 255.0f; |
54 | 25.9k | } |
55 | | |
56 | | /** |
57 | | * @brief Load a 16-bit fp16 texel from a data array. |
58 | | * |
59 | | * @param data The data pointer. |
60 | | * @param base_offset The index offset to the start of the pixel. |
61 | | */ |
62 | | static vfloat4 load_texel_f16( |
63 | | const void* data, |
64 | | size_t base_offset |
65 | 0 | ) { |
66 | 0 | const uint16_t* data16 = static_cast<const uint16_t*>(data); |
67 | 0 | int r = data16[base_offset ]; |
68 | 0 | int g = data16[base_offset + 1]; |
69 | 0 | int b = data16[base_offset + 2]; |
70 | 0 | int a = data16[base_offset + 3]; |
71 | 0 | return float16_to_float(vint4(r, g, b, a)); |
72 | 0 | } |
73 | | |
74 | | /** |
75 | | * @brief Load a 32-bit float texel from a data array. |
76 | | * |
77 | | * @param data The data pointer. |
78 | | * @param base_offset The index offset to the start of the pixel. |
79 | | */ |
80 | | static vfloat4 load_texel_f32( |
81 | | const void* data, |
82 | | size_t base_offset |
83 | 0 | ) { |
84 | 0 | const float* data32 = static_cast<const float*>(data); |
85 | 0 | return vfloat4(data32 + base_offset); |
86 | 0 | } |
87 | | |
88 | | /** |
89 | | * @brief Dummy no-op swizzle function. |
90 | | * |
91 | | * @param data The source RGBA vector to swizzle. |
92 | | * @param swz The swizzle to use. |
93 | | */ |
94 | | static vfloat4 swz_texel_skip( |
95 | | vfloat4 data, |
96 | | const astcenc_swizzle& swz |
97 | 25.9k | ) { |
98 | 25.9k | (void)swz; |
99 | 25.9k | return data; |
100 | 25.9k | } |
101 | | |
102 | | /** |
103 | | * @brief Swizzle a texel into a new arrangement. |
104 | | * |
105 | | * @param data The source RGBA vector to swizzle. |
106 | | * @param swz The swizzle to use. |
107 | | */ |
108 | | static vfloat4 swz_texel( |
109 | | vfloat4 data, |
110 | | const astcenc_swizzle& swz |
111 | 0 | ) { |
112 | 0 | ASTCENC_ALIGNAS float datas[6]; |
113 | |
|
114 | 0 | storea(data, datas); |
115 | 0 | datas[ASTCENC_SWZ_0] = 0.0f; |
116 | 0 | datas[ASTCENC_SWZ_1] = 1.0f; |
117 | |
|
118 | 0 | return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]); |
119 | 0 | } |
120 | | |
121 | | /** |
122 | | * @brief Encode a texel that is entirely LDR linear. |
123 | | * |
124 | | * @param data The RGBA data to encode. |
125 | | * @param lns_mask The mask for the HDR channels than need LNS encoding. |
126 | | */ |
127 | | static vfloat4 encode_texel_unorm( |
128 | | vfloat4 data, |
129 | | vmask4 lns_mask |
130 | 0 | ) { |
131 | 0 | (void)lns_mask; |
132 | 0 | return data * 65535.0f; |
133 | 0 | } |
134 | | |
135 | | /** |
136 | | * @brief Encode a texel that includes at least some HDR LNS texels. |
137 | | * |
138 | | * @param data The RGBA data to encode. |
139 | | * @param lns_mask The mask for the HDR channels than need LNS encoding. |
140 | | */ |
141 | | static vfloat4 encode_texel_lns( |
142 | | vfloat4 data, |
143 | | vmask4 lns_mask |
144 | 25.9k | ) { |
145 | 25.9k | vfloat4 datav_unorm = data * 65535.0f; |
146 | 25.9k | vfloat4 datav_lns = float_to_lns(data); |
147 | 25.9k | return select(datav_unorm, datav_lns, lns_mask); |
148 | 25.9k | } |
149 | | |
150 | | /* See header for documentation. */ |
151 | | void load_image_block( |
152 | | astcenc_profile decode_mode, |
153 | | const astcenc_image& img, |
154 | | image_block& blk, |
155 | | const block_size_descriptor& bsd, |
156 | | size_t pos_x, |
157 | | size_t pos_y, |
158 | | size_t pos_z, |
159 | | const astcenc_swizzle& swz |
160 | 1.12k | ) { |
161 | 1.12k | size_t size_x = img.dim_x; |
162 | 1.12k | size_t size_y = img.dim_y; |
163 | 1.12k | size_t size_z = img.dim_z; |
164 | | |
165 | 1.12k | blk.pos_x = pos_x; |
166 | 1.12k | blk.pos_y = pos_y; |
167 | 1.12k | blk.pos_z = pos_z; |
168 | | |
169 | | // True if any non-identity swizzle |
170 | 1.12k | bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || |
171 | 1.12k | (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); |
172 | | |
173 | 1.12k | vfloat4 data_min(1e38f); |
174 | 1.12k | vfloat4 data_mean(0.0f); |
175 | 1.12k | vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count)); |
176 | 1.12k | vfloat4 data_max(-1e38f); |
177 | 1.12k | vmask4 grayscalev(true); |
178 | 1.12k | size_t idx = 0; |
179 | | |
180 | | // This works because we impose the same choice everywhere during encode |
181 | 1.12k | uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) || |
182 | 1.12k | (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0; |
183 | 1.12k | uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0; |
184 | 1.12k | vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); |
185 | 1.12k | vmask4 lns_mask = use_lns != vint4::zero(); |
186 | | |
187 | | // Set up the function pointers for loading pipeline as needed |
188 | 1.12k | pixel_loader loader = load_texel_u8; |
189 | 1.12k | if (img.data_type == ASTCENC_TYPE_F16) |
190 | 0 | { |
191 | 0 | loader = load_texel_f16; |
192 | 0 | } |
193 | 1.12k | else if (img.data_type == ASTCENC_TYPE_F32) |
194 | 0 | { |
195 | 0 | loader = load_texel_f32; |
196 | 0 | } |
197 | | |
198 | 1.12k | pixel_swizzler swizzler = swz_texel_skip; |
199 | 1.12k | if (needs_swz) |
200 | 0 | { |
201 | 0 | swizzler = swz_texel; |
202 | 0 | } |
203 | | |
204 | 1.12k | pixel_converter converter = encode_texel_unorm; |
205 | 1.12k | if (any(lns_mask)) |
206 | 1.12k | { |
207 | 1.12k | converter = encode_texel_lns; |
208 | 1.12k | } |
209 | | |
210 | 2.24k | for (size_t z = 0; z < bsd.dim_z; z++) |
211 | 1.12k | { |
212 | 1.12k | size_t zi = astc::min(pos_z + z, size_z - 1); |
213 | 1.12k | void* plane = img.data[zi]; |
214 | | |
215 | 6.09k | for (size_t y = 0; y < bsd.dim_y; y++) |
216 | 4.97k | { |
217 | 4.97k | size_t yi = astc::min(pos_y + y, size_y - 1); |
218 | | |
219 | 30.9k | for (size_t x = 0; x < bsd.dim_x; x++) |
220 | 25.9k | { |
221 | 25.9k | size_t xi = astc::min(pos_x + x, size_x - 1); |
222 | | |
223 | 25.9k | vfloat4 datav = loader(plane, (4 * size_x * yi) + (4 * xi)); |
224 | 25.9k | datav = swizzler(datav, swz); |
225 | 25.9k | datav = converter(datav, lns_mask); |
226 | | |
227 | | // Compute block metadata |
228 | 25.9k | data_min = min(data_min, datav); |
229 | 25.9k | data_mean += datav * data_mean_scale; |
230 | 25.9k | data_max = max(data_max, datav); |
231 | | |
232 | 25.9k | grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); |
233 | | |
234 | 25.9k | blk.data_r[idx] = datav.lane<0>(); |
235 | 25.9k | blk.data_g[idx] = datav.lane<1>(); |
236 | 25.9k | blk.data_b[idx] = datav.lane<2>(); |
237 | 25.9k | blk.data_a[idx] = datav.lane<3>(); |
238 | | |
239 | 25.9k | blk.rgb_lns[idx] = rgb_lns; |
240 | 25.9k | blk.alpha_lns[idx] = a_lns; |
241 | | |
242 | 25.9k | idx++; |
243 | 25.9k | } |
244 | 4.97k | } |
245 | 1.12k | } |
246 | | |
247 | | // Reverse the encoding so we store origin block in the original format |
248 | 1.12k | vfloat4 data_enc = blk.texel(0); |
249 | 1.12k | vfloat4 data_enc_unorm = data_enc / 65535.0f; |
250 | 1.12k | vfloat4 data_enc_lns = vfloat4::zero(); |
251 | | |
252 | 1.12k | if (rgb_lns || a_lns) |
253 | 1.12k | { |
254 | 1.12k | data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc))); |
255 | 1.12k | } |
256 | | |
257 | 1.12k | blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask); |
258 | | |
259 | | // Store block metadata |
260 | 1.12k | blk.data_min = data_min; |
261 | 1.12k | blk.data_mean = data_mean; |
262 | 1.12k | blk.data_max = data_max; |
263 | 1.12k | blk.grayscale = all(grayscalev); |
264 | 1.12k | } |
265 | | |
266 | | /* See header for documentation. */ |
267 | | void load_image_block_fast_ldr( |
268 | | astcenc_profile decode_mode, |
269 | | const astcenc_image& img, |
270 | | image_block& blk, |
271 | | const block_size_descriptor& bsd, |
272 | | size_t pos_x, |
273 | | size_t pos_y, |
274 | | size_t pos_z, |
275 | | const astcenc_swizzle& swz |
276 | 1.11k | ) { |
277 | 1.11k | (void)swz; |
278 | 1.11k | (void)decode_mode; |
279 | | |
280 | 1.11k | size_t size_x = img.dim_x; |
281 | 1.11k | size_t size_y = img.dim_y; |
282 | | |
283 | 1.11k | blk.pos_x = pos_x; |
284 | 1.11k | blk.pos_y = pos_y; |
285 | 1.11k | blk.pos_z = pos_z; |
286 | | |
287 | 1.11k | vfloat4 data_min(1e38f); |
288 | 1.11k | vfloat4 data_mean = vfloat4::zero(); |
289 | 1.11k | vfloat4 data_max(-1e38f); |
290 | 1.11k | vmask4 grayscalev(true); |
291 | 1.11k | size_t idx = 0; |
292 | | |
293 | 1.11k | const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]); |
294 | 6.44k | for (size_t y = pos_y; y < pos_y + bsd.dim_y; y++) |
295 | 5.33k | { |
296 | 5.33k | size_t yi = astc::min(y, size_y - 1); |
297 | | |
298 | 36.5k | for (size_t x = pos_x; x < pos_x + bsd.dim_x; x++) |
299 | 31.2k | { |
300 | 31.2k | size_t xi = astc::min(x, size_x - 1); |
301 | | |
302 | 31.2k | vint4 datavi = vint4(plane + (4 * size_x * yi) + (4 * xi)); |
303 | 31.2k | vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); |
304 | | |
305 | | // Compute block metadata |
306 | 31.2k | data_min = min(data_min, datav); |
307 | 31.2k | data_mean += datav; |
308 | 31.2k | data_max = max(data_max, datav); |
309 | | |
310 | 31.2k | grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); |
311 | | |
312 | 31.2k | blk.data_r[idx] = datav.lane<0>(); |
313 | 31.2k | blk.data_g[idx] = datav.lane<1>(); |
314 | 31.2k | blk.data_b[idx] = datav.lane<2>(); |
315 | 31.2k | blk.data_a[idx] = datav.lane<3>(); |
316 | | |
317 | 31.2k | idx++; |
318 | 31.2k | } |
319 | 5.33k | } |
320 | | |
321 | | // Reverse the encoding so we store origin block in the original format |
322 | 1.11k | blk.origin_texel = blk.texel(0) / 65535.0f; |
323 | | |
324 | | // Store block metadata |
325 | 1.11k | blk.rgb_lns[0] = 0; |
326 | 1.11k | blk.alpha_lns[0] = 0; |
327 | 1.11k | blk.data_min = data_min; |
328 | 1.11k | blk.data_mean = data_mean / static_cast<float>(bsd.texel_count); |
329 | 1.11k | blk.data_max = data_max; |
330 | 1.11k | blk.grayscale = all(grayscalev); |
331 | 1.11k | } |
332 | | |
333 | | /* See header for documentation. */ |
334 | | void store_image_block( |
335 | | astcenc_image& img, |
336 | | const image_block& blk, |
337 | | const block_size_descriptor& bsd, |
338 | | size_t pos_x, |
339 | | size_t pos_y, |
340 | | size_t pos_z, |
341 | | const astcenc_swizzle& swz |
342 | 0 | ) { |
343 | 0 | size_t size_x = img.dim_x; |
344 | 0 | size_t start_x = pos_x; |
345 | 0 | size_t end_x = astc::min(size_x, pos_x + bsd.dim_x); |
346 | 0 | size_t count_x = end_x - start_x; |
347 | 0 | size_t nudge_x = bsd.dim_x - count_x; |
348 | |
|
349 | 0 | size_t size_y = img.dim_y; |
350 | 0 | size_t start_y = pos_y; |
351 | 0 | size_t end_y = astc::min(size_y, pos_y + bsd.dim_y); |
352 | 0 | size_t count_y = end_y - start_y; |
353 | 0 | size_t nudge_y = (bsd.dim_y - count_y) * bsd.dim_x; |
354 | |
|
355 | 0 | size_t size_z = img.dim_z; |
356 | 0 | size_t start_z = pos_z; |
357 | 0 | size_t end_z = astc::min(size_z, pos_z + bsd.dim_z); |
358 | |
|
359 | 0 | size_t idx = 0; |
360 | | |
361 | | // True if any non-identity swizzle |
362 | 0 | bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || |
363 | 0 | (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); |
364 | | |
365 | | // True if any swizzle uses Z reconstruct |
366 | 0 | bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) || |
367 | 0 | (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z); |
368 | |
|
369 | 0 | if (img.data_type == ASTCENC_TYPE_U8) |
370 | 0 | { |
371 | 0 | for (size_t z = start_z; z < end_z; z++) |
372 | 0 | { |
373 | | // Fetch the image plane |
374 | 0 | uint8_t* data8 = static_cast<uint8_t*>(img.data[z]); |
375 | |
|
376 | 0 | for (size_t y = start_y; y < end_y; y++) |
377 | 0 | { |
378 | 0 | uint8_t* data8_row = data8 + (4 * size_x * y) + (4 * start_x); |
379 | |
|
380 | 0 | for (size_t x = 0; x < count_x; x += ASTCENC_SIMD_WIDTH) |
381 | 0 | { |
382 | 0 | size_t max_texels = ASTCENC_SIMD_WIDTH; |
383 | 0 | size_t used_texels = astc::min(count_x - x, max_texels); |
384 | | |
385 | | // Unaligned load as rows are not always SIMD_WIDTH long |
386 | 0 | vfloat data_r(blk.data_r + idx); |
387 | 0 | vfloat data_g(blk.data_g + idx); |
388 | 0 | vfloat data_b(blk.data_b + idx); |
389 | 0 | vfloat data_a(blk.data_a + idx); |
390 | | |
391 | | // Clamp values to [0.0, 1.0] range before unorm conversion |
392 | | // - Values > 1.0 are possible for all HDR blocks |
393 | | // - Values < 0.0 are possible for HDR void-extent blocks |
394 | 0 | vint data_ri = float_to_int_rtn(clampzo(data_r) * 255.0f); |
395 | 0 | vint data_gi = float_to_int_rtn(clampzo(data_g) * 255.0f); |
396 | 0 | vint data_bi = float_to_int_rtn(clampzo(data_b) * 255.0f); |
397 | 0 | vint data_ai = float_to_int_rtn(clampzo(data_a) * 255.0f); |
398 | |
|
399 | 0 | if (needs_swz) |
400 | 0 | { |
401 | 0 | vint swizzle_table[7]; |
402 | 0 | swizzle_table[ASTCENC_SWZ_0] = vint(0); |
403 | 0 | swizzle_table[ASTCENC_SWZ_1] = vint(255); |
404 | 0 | swizzle_table[ASTCENC_SWZ_R] = data_ri; |
405 | 0 | swizzle_table[ASTCENC_SWZ_G] = data_gi; |
406 | 0 | swizzle_table[ASTCENC_SWZ_B] = data_bi; |
407 | 0 | swizzle_table[ASTCENC_SWZ_A] = data_ai; |
408 | |
|
409 | 0 | if (needs_z) |
410 | 0 | { |
411 | 0 | vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f); |
412 | 0 | vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f); |
413 | 0 | vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y); |
414 | 0 | data_z = max(data_z, 0.0f); |
415 | 0 | data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f); |
416 | |
|
417 | 0 | swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f); |
418 | 0 | } |
419 | |
|
420 | 0 | data_ri = swizzle_table[swz.r]; |
421 | 0 | data_gi = swizzle_table[swz.g]; |
422 | 0 | data_bi = swizzle_table[swz.b]; |
423 | 0 | data_ai = swizzle_table[swz.a]; |
424 | 0 | } |
425 | | |
426 | | // Errors are NaN encoded - convert to magenta error color |
427 | | // Branch is OK here - it is almost never true so predicts well |
428 | 0 | vmask nan_mask = data_r != data_r; |
429 | 0 | if (any(nan_mask)) |
430 | 0 | { |
431 | 0 | data_ri = select(data_ri, vint(0xFF), nan_mask); |
432 | 0 | data_gi = select(data_gi, vint(0x00), nan_mask); |
433 | 0 | data_bi = select(data_bi, vint(0xFF), nan_mask); |
434 | 0 | data_ai = select(data_ai, vint(0xFF), nan_mask); |
435 | 0 | } |
436 | |
|
437 | 0 | vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai); |
438 | | // Static cast must be safe, as used_texels must be less than vector length |
439 | 0 | vmask store_mask = vint::lane_id() < vint(static_cast<int>(used_texels)); |
440 | 0 | store_lanes_masked(data8_row, data_rgbai, store_mask); |
441 | |
|
442 | 0 | data8_row += ASTCENC_SIMD_WIDTH * 4; |
443 | 0 | idx += used_texels; |
444 | 0 | } |
445 | 0 | idx += nudge_x; |
446 | 0 | } |
447 | 0 | idx += nudge_y; |
448 | 0 | } |
449 | 0 | } |
450 | 0 | else if (img.data_type == ASTCENC_TYPE_F16) |
451 | 0 | { |
452 | 0 | for (size_t z = start_z; z < end_z; z++) |
453 | 0 | { |
454 | | // Fetch the image plane |
455 | 0 | uint16_t* data16 = static_cast<uint16_t*>(img.data[z]); |
456 | |
|
457 | 0 | for (size_t y = start_y; y < end_y; y++) |
458 | 0 | { |
459 | 0 | uint16_t* data16_row = data16 + (4 * size_x * y) + (4 * start_x); |
460 | |
|
461 | 0 | for (size_t x = 0; x < count_x; x++) |
462 | 0 | { |
463 | 0 | vint4 color; |
464 | | |
465 | | // NaNs are handled inline - no need to special case |
466 | 0 | if (needs_swz) |
467 | 0 | { |
468 | 0 | float data[7]; |
469 | 0 | data[ASTCENC_SWZ_0] = 0.0f; |
470 | 0 | data[ASTCENC_SWZ_1] = 1.0f; |
471 | 0 | data[ASTCENC_SWZ_R] = blk.data_r[idx]; |
472 | 0 | data[ASTCENC_SWZ_G] = blk.data_g[idx]; |
473 | 0 | data[ASTCENC_SWZ_B] = blk.data_b[idx]; |
474 | 0 | data[ASTCENC_SWZ_A] = blk.data_a[idx]; |
475 | |
|
476 | 0 | if (needs_z) |
477 | 0 | { |
478 | 0 | float xN = (data[0] * 2.0f) - 1.0f; |
479 | 0 | float yN = (data[3] * 2.0f) - 1.0f; |
480 | 0 | float zN = 1.0f - xN * xN - yN * yN; |
481 | 0 | if (zN < 0.0f) |
482 | 0 | { |
483 | 0 | zN = 0.0f; |
484 | 0 | } |
485 | 0 | data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; |
486 | 0 | } |
487 | |
|
488 | 0 | vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); |
489 | 0 | color = float_to_float16(colorf); |
490 | 0 | } |
491 | 0 | else |
492 | 0 | { |
493 | 0 | vfloat4 colorf = blk.texel(idx); |
494 | 0 | color = float_to_float16(colorf); |
495 | 0 | } |
496 | | |
497 | | // TODO: Vectorize with store N shorts? |
498 | 0 | data16_row[0] = static_cast<uint16_t>(color.lane<0>()); |
499 | 0 | data16_row[1] = static_cast<uint16_t>(color.lane<1>()); |
500 | 0 | data16_row[2] = static_cast<uint16_t>(color.lane<2>()); |
501 | 0 | data16_row[3] = static_cast<uint16_t>(color.lane<3>()); |
502 | 0 | data16_row += 4; |
503 | 0 | idx++; |
504 | 0 | } |
505 | 0 | idx += nudge_x; |
506 | 0 | } |
507 | 0 | idx += nudge_y; |
508 | 0 | } |
509 | 0 | } |
510 | 0 | else // if (img.data_type == ASTCENC_TYPE_F32) |
511 | 0 | { |
512 | 0 | assert(img.data_type == ASTCENC_TYPE_F32); |
513 | | |
514 | 0 | for (size_t z = start_z; z < end_z; z++) |
515 | 0 | { |
516 | | // Fetch the image plane |
517 | 0 | float* data32 = static_cast<float*>(img.data[z]); |
518 | |
|
519 | 0 | for (size_t y = start_y; y < end_y; y++) |
520 | 0 | { |
521 | 0 | float* data32_row = data32 + (4 * size_x * y) + (4 * start_x); |
522 | |
|
523 | 0 | for (size_t x = 0; x < count_x; x++) |
524 | 0 | { |
525 | 0 | vfloat4 color = blk.texel(idx); |
526 | | |
527 | | // NaNs are handled inline - no need to special case |
528 | 0 | if (needs_swz) |
529 | 0 | { |
530 | 0 | float data[7]; |
531 | 0 | data[ASTCENC_SWZ_0] = 0.0f; |
532 | 0 | data[ASTCENC_SWZ_1] = 1.0f; |
533 | 0 | data[ASTCENC_SWZ_R] = color.lane<0>(); |
534 | 0 | data[ASTCENC_SWZ_G] = color.lane<1>(); |
535 | 0 | data[ASTCENC_SWZ_B] = color.lane<2>(); |
536 | 0 | data[ASTCENC_SWZ_A] = color.lane<3>(); |
537 | |
|
538 | 0 | if (needs_z) |
539 | 0 | { |
540 | 0 | float xN = (data[0] * 2.0f) - 1.0f; |
541 | 0 | float yN = (data[3] * 2.0f) - 1.0f; |
542 | 0 | float zN = 1.0f - xN * xN - yN * yN; |
543 | 0 | if (zN < 0.0f) |
544 | 0 | { |
545 | 0 | zN = 0.0f; |
546 | 0 | } |
547 | 0 | data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; |
548 | 0 | } |
549 | |
|
550 | 0 | color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); |
551 | 0 | } |
552 | |
|
553 | 0 | store(color, data32_row); |
554 | 0 | data32_row += 4; |
555 | 0 | idx++; |
556 | 0 | } |
557 | 0 | idx += nudge_x; |
558 | 0 | } |
559 | 0 | idx += nudge_y; |
560 | 0 | } |
561 | 0 | } |
562 | 0 | } |