/src/astc-encoder/Source/astcenc_image.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2026 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | /** |
19 | | * @brief Functions for creating in-memory ASTC image structures. |
20 | | */ |
21 | | |
22 | | #include <cassert> |
23 | | #include <cstring> |
24 | | |
25 | | #include "astcenc_internal.h" |
26 | | |
27 | | /** |
28 | | * @brief Loader pipeline function type for data fetch from memory. |
 | | * |
 | | * Takes the image plane pointer and an integer element offset to the start of |
 | | * the texel, and returns the texel as a four-lane float vector. |
29 | | */ |
30 | | using pixel_loader = vfloat4(*)(const void*, int); |
31 | | 
32 | | /** |
33 | | * @brief Loader pipeline function type for swizzling data in a vector. |
 | | * |
 | | * Takes the loaded texel and the swizzle description, and returns the |
 | | * rearranged texel. |
34 | | */ |
35 | | using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); |
36 | | 
37 | | /** |
38 | | * @brief Loader pipeline function type for converting data in a vector to LNS. |
 | | * |
 | | * Takes the swizzled texel and a per-lane mask, and returns the texel in the |
 | | * encoding that is stored in the image block. |
39 | | */ |
40 | | using pixel_converter = vfloat4(*)(vfloat4, vmask4); |
41 | | |
42 | | /** |
43 | | * @brief Load an 8-bit UNORM texel from a data array. |
44 | | * |
 | | * The texel is read through the vint4 pointer constructor, converted to float, |
 | | * and divided by 255.0f. Presumably the constructor reads four consecutive |
 | | * bytes, one per lane (TODO confirm against the vint4 definition). |
 | | * |
45 | | * @param data The data pointer, interpreted as an array of uint8_t. |
46 | | * @param base_offset The index offset to the start of the pixel, in uint8_t elements. |
 | | * |
 | | * @return The loaded texel as a float vector. |
47 | | */ |
48 | | static vfloat4 load_texel_u8( |
49 | | const void* data, |
50 | | int base_offset |
51 | 27.9k | ) { |
52 | 27.9k | const uint8_t* data8 = static_cast<const uint8_t*>(data); |
53 | 27.9k | return int_to_float(vint4(data8 + base_offset)) / 255.0f; |
54 | 27.9k | } |
55 | | |
56 | | /** |
57 | | * @brief Load a 16-bit fp16 texel from a data array. |
58 | | * |
 | | * Four consecutive uint16_t values are read, widened to int, packed into a |
 | | * vint4 in r, g, b, a order, and passed to float16_to_float. |
 | | * |
59 | | * @param data The data pointer, interpreted as an array of uint16_t. |
60 | | * @param base_offset The index offset to the start of the pixel, in uint16_t elements. |
 | | * |
 | | * @return The loaded texel as a float vector. |
61 | | */ |
62 | | static vfloat4 load_texel_f16( |
63 | | const void* data, |
64 | | int base_offset |
65 | 0 | ) { |
66 | 0 | const uint16_t* data16 = static_cast<const uint16_t*>(data); |
67 | 0 | int r = data16[base_offset ]; |
68 | 0 | int g = data16[base_offset + 1]; |
69 | 0 | int b = data16[base_offset + 2]; |
70 | 0 | int a = data16[base_offset + 3]; |
71 | 0 | return float16_to_float(vint4(r, g, b, a)); |
72 | 0 | } |
73 | | |
74 | | /** |
75 | | * @brief Load a 32-bit float texel from a data array. |
76 | | * |
 | | * The texel is read through the vfloat4 pointer constructor with no further |
 | | * conversion. Presumably the constructor reads four consecutive floats (TODO |
 | | * confirm against the vfloat4 definition). |
 | | * |
77 | | * @param data The data pointer, interpreted as an array of float. |
78 | | * @param base_offset The index offset to the start of the pixel, in float elements. |
 | | * |
 | | * @return The loaded texel as a float vector. |
79 | | */ |
80 | | static vfloat4 load_texel_f32( |
81 | | const void* data, |
82 | | int base_offset |
83 | 0 | ) { |
84 | 0 | const float* data32 = static_cast<const float*>(data); |
85 | 0 | return vfloat4(data32 + base_offset); |
86 | 0 | } |
87 | | |
88 | | /** |
89 | | * @brief Dummy no-op swizzle function. |
90 | | * |
 | | * This has the pixel_swizzler signature so that it can be installed in the |
 | | * loader pipeline when the swizzle is the identity mapping. |
 | | * |
91 | | * @param data The source RGBA vector to swizzle. |
92 | | * @param swz The swizzle to use; it is ignored by this function. |
 | | * |
 | | * @return The input vector, unchanged. |
93 | | */ |
94 | | static vfloat4 swz_texel_skip( |
95 | | vfloat4 data, |
96 | | const astcenc_swizzle& swz |
97 | 27.9k | ) { |
98 | 27.9k | (void)swz; |
99 | 27.9k | return data; |
100 | 27.9k | } |
101 | | |
102 | | /** |
103 | | * @brief Swizzle a texel into a new arrangement. |
104 | | * |
 | | * The texel is stored into a small lookup table, the table entries for |
 | | * ASTCENC_SWZ_0 and ASTCENC_SWZ_1 are set to 0.0f and 1.0f, and each output |
 | | * lane is then read from the table using the matching selector in swz. |
 | | * |
 | | * NOTE(review): the table has six entries, whereas the store path in this file |
 | | * sizes its table at seven to hold ASTCENC_SWZ_Z. Presumably a Z selector is |
 | | * never passed to this function; confirm against the caller-side validation. |
 | | * |
105 | | * @param data The source RGBA vector to swizzle. |
106 | | * @param swz The swizzle to use. |
 | | * |
 | | * @return The swizzled texel. |
107 | | */ |
108 | | static vfloat4 swz_texel( |
109 | | vfloat4 data, |
110 | | const astcenc_swizzle& swz |
111 | 0 | ) { |
112 | 0 | ASTCENC_ALIGNAS float datas[6]; |
113 | 

114 | 0 | storea(data, datas); |
115 | 0 | datas[ASTCENC_SWZ_0] = 0.0f; |
116 | 0 | datas[ASTCENC_SWZ_1] = 1.0f; |
117 | 

118 | 0 | return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]); |
119 | 0 | } |
120 | | |
121 | | /** |
122 | | * @brief Encode a texel that is entirely LDR linear. |
123 | | * |
 | | * Every lane is scaled by 65535.0f. The mask parameter exists only so that the |
 | | * function matches the pixel_converter signature. |
 | | * |
124 | | * @param data The RGBA data to encode. |
125 | | * @param lns_mask The mask for the HDR channels that need LNS encoding; it is ignored here. |
 | | * |
 | | * @return The scaled texel. |
126 | | */ |
127 | | static vfloat4 encode_texel_unorm( |
128 | | vfloat4 data, |
129 | | vmask4 lns_mask |
130 | 0 | ) { |
131 | 0 | (void)lns_mask; |
132 | 0 | return data * 65535.0f; |
133 | 0 | } |
134 | | |
135 | | /** |
136 | | * @brief Encode a texel that includes at least some HDR LNS texels. |
137 | | * |
 | | * Both encodings are computed for every lane: the value scaled by 65535.0f, |
 | | * and the result of float_to_lns. The mask then picks between the two per |
 | | * lane; presumably lanes with the mask set take the LNS value, matching how |
 | | * select is used elsewhere in this file (TODO confirm against its definition). |
 | | * |
138 | | * @param data The RGBA data to encode. |
139 | | * @param lns_mask The mask for the HDR channels that need LNS encoding. |
 | | * |
 | | * @return The encoded texel. |
140 | | */ |
141 | | static vfloat4 encode_texel_lns( |
142 | | vfloat4 data, |
143 | | vmask4 lns_mask |
144 | 27.9k | ) { |
145 | 27.9k | vfloat4 datav_unorm = data * 65535.0f; |
146 | 27.9k | vfloat4 datav_lns = float_to_lns(data); |
147 | 27.9k | return select(datav_unorm, datav_lns, lns_mask); |
148 | 27.9k | } |
149 | | |
150 | | /* See header for documentation. |
 | | * |
 | | * Implementation notes: |
 | | * |
 | | * - The block covers bsd.xdim by bsd.ydim by bsd.zdim texels starting at |
 | | *   (xpos, ypos, zpos). Each coordinate is clamped to the last valid index of |
 | | *   the image, so a block that overhangs the image edge repeats edge texels. |
 | | * - Each texel runs through three function pointers chosen once per block: a |
 | | *   loader picked from img.data_type (8-bit unless F16 or F32), a swizzler that |
 | | *   is a no-op for the identity swizzle, and a converter that applies LNS |
 | | *   encoding only when at least one lane of the LNS mask is set. |
 | | * - RGB lanes use LNS for ASTCENC_PRF_HDR and ASTCENC_PRF_HDR_RGB_LDR_A; the |
 | | *   alpha lane uses LNS only for ASTCENC_PRF_HDR. |
 | | * - The per-channel minimum, mean and maximum, and a grayscale flag, are |
 | | *   accumulated while loading and stored in the block at the end. |
 | | * - origin_texel is texel 0 converted back from the stored encoding. |
 | | * |
 | | * NOTE(review): the texel offset is computed in unsigned int and then passed to |
 | | * the loader as int. Presumably image planes are small enough that this never |
 | | * wraps; confirm the size limits enforced by the caller. |
 | | */ |
151 | | void load_image_block( |
152 | | astcenc_profile decode_mode, |
153 | | const astcenc_image& img, |
154 | | image_block& blk, |
155 | | const block_size_descriptor& bsd, |
156 | | unsigned int xpos, |
157 | | unsigned int ypos, |
158 | | unsigned int zpos, |
159 | | const astcenc_swizzle& swz |
160 | 1.12k | ) { |
161 | 1.12k | unsigned int xsize = img.dim_x; |
162 | 1.12k | unsigned int ysize = img.dim_y; |
163 | 1.12k | unsigned int zsize = img.dim_z; |
164 | | 
165 | 1.12k | blk.xpos = xpos; |
166 | 1.12k | blk.ypos = ypos; |
167 | 1.12k | blk.zpos = zpos; |
168 | | 
169 | | // True if any non-identity swizzle |
170 | 1.12k | bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || |
171 | 1.12k | (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); |
172 | | 
173 | 1.12k | int idx = 0; |
174 | | 
175 | 1.12k | vfloat4 data_min(1e38f); |
176 | 1.12k | vfloat4 data_mean(0.0f); |
177 | 1.12k | vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count)); |
178 | 1.12k | vfloat4 data_max(-1e38f); |
179 | 1.12k | vmask4 grayscalev(true); |
180 | | 
181 | | // This works because we impose the same choice everywhere during encode |
182 | 1.12k | uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) || |
183 | 1.12k | (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0; |
184 | 1.12k | uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0; |
185 | 1.12k | vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); |
186 | 1.12k | vmask4 lns_mask = use_lns != vint4::zero(); |
187 | | 
188 | | // Set up the function pointers for loading pipeline as needed |
189 | 1.12k | pixel_loader loader = load_texel_u8; |
190 | 1.12k | if (img.data_type == ASTCENC_TYPE_F16) |
191 | 0 | { |
192 | 0 | loader = load_texel_f16; |
193 | 0 | } |
194 | 1.12k | else if (img.data_type == ASTCENC_TYPE_F32) |
195 | 0 | { |
196 | 0 | loader = load_texel_f32; |
197 | 0 | } |
198 | | 
199 | 1.12k | pixel_swizzler swizzler = swz_texel_skip; |
200 | 1.12k | if (needs_swz) |
201 | 0 | { |
202 | 0 | swizzler = swz_texel; |
203 | 0 | } |
204 | | 
205 | 1.12k | pixel_converter converter = encode_texel_unorm; |
206 | 1.12k | if (any(lns_mask)) |
207 | 1.12k | { |
208 | 1.12k | converter = encode_texel_lns; |
209 | 1.12k | } |
210 | | 
211 | 2.24k | for (unsigned int z = 0; z < bsd.zdim; z++) |
212 | 1.12k | { |
213 | 1.12k | unsigned int zi = astc::min(zpos + z, zsize - 1); |
214 | 1.12k | void* plane = img.data[zi]; |
215 | | 
216 | 6.24k | for (unsigned int y = 0; y < bsd.ydim; y++) |
217 | 5.11k | { |
218 | 5.11k | unsigned int yi = astc::min(ypos + y, ysize - 1); |
219 | | 
220 | 33.0k | for (unsigned int x = 0; x < bsd.xdim; x++) |
221 | 27.9k | { |
222 | 27.9k | unsigned int xi = astc::min(xpos + x, xsize - 1); |
223 | | // Offset is in elements: four components per texel, xsize texels per row |
224 | 27.9k | vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi)); |
225 | 27.9k | datav = swizzler(datav, swz); |
226 | 27.9k | datav = converter(datav, lns_mask); |
227 | | 
228 | | // Compute block metadata |
229 | 27.9k | data_min = min(data_min, datav); |
230 | 27.9k | data_mean += datav * data_mean_scale; |
231 | 27.9k | data_max = max(data_max, datav); |
232 | | // Grayscale needs r == g and r == b; lanes compare (r,r,r,r) with (g,g,b,b) |
233 | 27.9k | grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); |
234 | | 
235 | 27.9k | blk.data_r[idx] = datav.lane<0>(); |
236 | 27.9k | blk.data_g[idx] = datav.lane<1>(); |
237 | 27.9k | blk.data_b[idx] = datav.lane<2>(); |
238 | 27.9k | blk.data_a[idx] = datav.lane<3>(); |
239 | | // The LNS flags are the same for every texel in the block but are stored per texel |
240 | 27.9k | blk.rgb_lns[idx] = rgb_lns; |
241 | 27.9k | blk.alpha_lns[idx] = a_lns; |
242 | | 
243 | 27.9k | idx++; |
244 | 27.9k | } |
245 | 5.11k | } |
246 | 1.12k | } |
247 | | 
248 | | // Reverse the encoding so we store origin block in the original format |
249 | 1.12k | vfloat4 data_enc = blk.texel(0); |
250 | 1.12k | vfloat4 data_enc_unorm = data_enc / 65535.0f; |
251 | 1.12k | vfloat4 data_enc_lns = vfloat4::zero(); |
252 | | 
253 | 1.12k | if (rgb_lns || a_lns) |
254 | 1.12k | { |
255 | 1.12k | data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc))); |
256 | 1.12k | } |
257 | | 
258 | 1.12k | blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask); |
259 | | 
260 | | // Store block metadata |
261 | 1.12k | blk.data_min = data_min; |
262 | 1.12k | blk.data_mean = data_mean; |
263 | 1.12k | blk.data_max = data_max; |
264 | 1.12k | blk.grayscale = all(grayscalev); |
265 | 1.12k | } |
266 | | |
267 | | /* See header for documentation. |
 | | * |
 | | * Implementation notes: |
 | | * |
 | | * - This path has the same signature as load_image_block but ignores both swz |
 | | *   and decode_mode, always reads img.data[0] as 8-bit data, and has no loop |
 | | *   over z. Presumably callers select it only for 2D 8-bit LDR input with an |
 | | *   identity swizzle; confirm at the call site. |
 | | * - Texel coordinates are clamped to the last valid index in x and y, so a block |
 | | *   that overhangs the image edge repeats edge texels. |
 | | * - Each channel is scaled by 65535.0f / 255.0f in a single multiply. |
 | | * - The mean is accumulated as a plain sum and divided by bsd.texel_count once |
 | | *   at the end. |
 | | * - Only element 0 of the rgb_lns and alpha_lns arrays is written. |
 | | */ |
268 | | void load_image_block_fast_ldr( |
269 | | astcenc_profile decode_mode, |
270 | | const astcenc_image& img, |
271 | | image_block& blk, |
272 | | const block_size_descriptor& bsd, |
273 | | unsigned int xpos, |
274 | | unsigned int ypos, |
275 | | unsigned int zpos, |
276 | | const astcenc_swizzle& swz |
277 | 1.09k | ) { |
278 | 1.09k | (void)swz; |
279 | 1.09k | (void)decode_mode; |
280 | | 
281 | 1.09k | unsigned int xsize = img.dim_x; |
282 | 1.09k | unsigned int ysize = img.dim_y; |
283 | | 
284 | 1.09k | blk.xpos = xpos; |
285 | 1.09k | blk.ypos = ypos; |
286 | 1.09k | blk.zpos = zpos; |
287 | | 
288 | 1.09k | vfloat4 data_min(1e38f); |
289 | 1.09k | vfloat4 data_mean = vfloat4::zero(); |
290 | 1.09k | vfloat4 data_max(-1e38f); |
291 | 1.09k | vmask4 grayscalev(true); |
292 | 1.09k | int idx = 0; |
293 | | 
294 | 1.09k | const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]); |
295 | 6.22k | for (unsigned int y = ypos; y < ypos + bsd.ydim; y++) |
296 | 5.13k | { |
297 | 5.13k | unsigned int yi = astc::min(y, ysize - 1); |
298 | | 
299 | 34.0k | for (unsigned int x = xpos; x < xpos + bsd.xdim; x++) |
300 | 28.9k | { |
301 | 28.9k | unsigned int xi = astc::min(x, xsize - 1); |
302 | | // Byte offset of texel (xi, yi): four bytes per texel, xsize texels per row |
303 | 28.9k | vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi)); |
304 | 28.9k | vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); |
305 | | 
306 | | // Compute block metadata |
307 | 28.9k | data_min = min(data_min, datav); |
308 | 28.9k | data_mean += datav; |
309 | 28.9k | data_max = max(data_max, datav); |
310 | | // Grayscale needs r == g and r == b; lanes compare (r,r,r,r) with (g,g,b,b) |
311 | 28.9k | grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); |
312 | | 
313 | 28.9k | blk.data_r[idx] = datav.lane<0>(); |
314 | 28.9k | blk.data_g[idx] = datav.lane<1>(); |
315 | 28.9k | blk.data_b[idx] = datav.lane<2>(); |
316 | 28.9k | blk.data_a[idx] = datav.lane<3>(); |
317 | | 
318 | 28.9k | idx++; |
319 | 28.9k | } |
320 | 5.13k | } |
321 | | 
322 | | // Reverse the encoding so we store origin block in the original format |
323 | 1.09k | blk.origin_texel = blk.texel(0) / 65535.0f; |
324 | | 
325 | | // Store block metadata; only element 0 of the LNS flag arrays is written here |
326 | 1.09k | blk.rgb_lns[0] = 0; |
327 | 1.09k | blk.alpha_lns[0] = 0; |
328 | 1.09k | blk.data_min = data_min; |
329 | 1.09k | blk.data_mean = data_mean / static_cast<float>(bsd.texel_count); |
330 | 1.09k | blk.data_max = data_max; |
331 | 1.09k | blk.grayscale = all(grayscalev); |
332 | 1.09k | } |
333 | | |
334 | | /* See header for documentation. |
 | | * |
 | | * Implementation notes: |
 | | * |
 | | * - The written region is clipped to the image: the end coordinate on each axis |
 | | *   is the smaller of the image size and the block end. |
 | | * - idx walks the block texel arrays in block order. x_nudge and y_nudge skip |
 | | *   the block texels that fall outside the image so that idx stays in step with |
 | | *   the block layout when a row or a plane is clipped. |
 | | * - There is one code path per img.data_type. The 8-bit path handles |
 | | *   ASTCENC_SIMD_WIDTH texels per iteration and uses a masked store; the F16 |
 | | *   and F32 paths handle one texel per iteration. |
 | | * - Only the 8-bit path clamps values and replaces NaN texels with an explicit |
 | | *   error color. The NaN test looks at the red channel only. |
 | | * - When a swizzle selects ASTCENC_SWZ_Z, a Z value is rebuilt from the red and |
 | | *   alpha channels, treated as X and Y remapped from the 0 to 1 range to the |
 | | *   -1 to 1 range, as sqrt(max(0, 1 - X*X - Y*Y)) and remapped back to 0 to 1. |
 | | * |
 | | * NOTE(review): the F16 and F32 paths read data[0] and data[3] for the Z |
 | | * reconstruction. Presumably ASTCENC_SWZ_R is 0 and ASTCENC_SWZ_A is 3, which |
 | | * would make these the red and alpha entries; confirm against the enum. |
 | | * |
 | | * NOTE(review): the 8-bit path loads a full vector from the block arrays even |
 | | * when fewer than ASTCENC_SIMD_WIDTH texels remain in the row. Presumably the |
 | | * block arrays are padded to allow this; confirm against image_block. |
 | | */ |
335 | | void store_image_block( |
336 | | astcenc_image& img, |
337 | | const image_block& blk, |
338 | | const block_size_descriptor& bsd, |
339 | | unsigned int xpos, |
340 | | unsigned int ypos, |
341 | | unsigned int zpos, |
342 | | const astcenc_swizzle& swz |
343 | 3.95k | ) { |
344 | 3.95k | unsigned int x_size = img.dim_x; |
345 | 3.95k | unsigned int x_start = xpos; |
346 | 3.95k | unsigned int x_end = astc::min(x_size, xpos + bsd.xdim); |
347 | 3.95k | unsigned int x_count = x_end - x_start; |
348 | 3.95k | unsigned int x_nudge = bsd.xdim - x_count; |
349 | | 
350 | 3.95k | unsigned int y_size = img.dim_y; |
351 | 3.95k | unsigned int y_start = ypos; |
352 | 3.95k | unsigned int y_end = astc::min(y_size, ypos + bsd.ydim); |
353 | 3.95k | unsigned int y_count = y_end - y_start; |
354 | 3.95k | unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim; |
355 | | 
356 | 3.95k | unsigned int z_size = img.dim_z; |
357 | 3.95k | unsigned int z_start = zpos; |
358 | 3.95k | unsigned int z_end = astc::min(z_size, zpos + bsd.zdim); |
359 | | 
360 | | // True if any non-identity swizzle |
361 | 3.95k | bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || |
362 | 3.95k | (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); |
363 | | 
364 | | // True if any swizzle uses Z reconstruct |
365 | 3.95k | bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) || |
366 | 3.95k | (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z); |
367 | | // idx is the read position in the block texel arrays, shared by all three paths |
368 | 3.95k | int idx = 0; |
369 | 3.95k | if (img.data_type == ASTCENC_TYPE_U8) |
370 | 3.95k | { |
371 | 7.90k | for (unsigned int z = z_start; z < z_end; z++) |
372 | 3.95k | { |
373 | | // Fetch the image plane |
374 | 3.95k | uint8_t* data8 = static_cast<uint8_t*>(img.data[z]); |
375 | | 
376 | 33.8k | for (unsigned int y = y_start; y < y_end; y++) |
377 | 29.8k | { |
378 | 29.8k | uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start); |
379 | | 
380 | 108k | for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH) |
381 | 78.9k | { |
382 | 78.9k | unsigned int max_texels = ASTCENC_SIMD_WIDTH; |
383 | 78.9k | unsigned int used_texels = astc::min(x_count - x, max_texels); |
384 | | 
385 | | // Unaligned load as rows are not always SIMD_WIDTH long |
386 | 78.9k | vfloat data_r(blk.data_r + idx); |
387 | 78.9k | vfloat data_g(blk.data_g + idx); |
388 | 78.9k | vfloat data_b(blk.data_b + idx); |
389 | 78.9k | vfloat data_a(blk.data_a + idx); |
390 | | 
391 | | // Clamp values to [0.0, 1.0] range before unorm conversion |
392 | | // - Values > 1.0 are possible for all HDR blocks |
393 | | // - Values < 0.0 are possible for HDR void-extent blocks |
394 | 78.9k | vint data_ri = float_to_int_rtn(clampzo(data_r) * 255.0f); |
395 | 78.9k | vint data_gi = float_to_int_rtn(clampzo(data_g) * 255.0f); |
396 | 78.9k | vint data_bi = float_to_int_rtn(clampzo(data_b) * 255.0f); |
397 | 78.9k | vint data_ai = float_to_int_rtn(clampzo(data_a) * 255.0f); |
398 | | 
399 | 78.9k | if (needs_swz) |
400 | 0 | { |
401 | 0 | vint swizzle_table[7]; |
402 | 0 | swizzle_table[ASTCENC_SWZ_0] = vint(0); |
403 | 0 | swizzle_table[ASTCENC_SWZ_1] = vint(255); |
404 | 0 | swizzle_table[ASTCENC_SWZ_R] = data_ri; |
405 | 0 | swizzle_table[ASTCENC_SWZ_G] = data_gi; |
406 | 0 | swizzle_table[ASTCENC_SWZ_B] = data_bi; |
407 | 0 | swizzle_table[ASTCENC_SWZ_A] = data_ai; |
408 | 

409 | 0 | if (needs_z) |
410 | 0 | { |
411 | 0 | vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f); |
412 | 0 | vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f); |
413 | 0 | vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y); |
414 | 0 | data_z = max(data_z, 0.0f); |
415 | 0 | data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f); |
416 | 

417 | 0 | swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f); |
418 | 0 | } |
419 | 

420 | 0 | data_ri = swizzle_table[swz.r]; |
421 | 0 | data_gi = swizzle_table[swz.g]; |
422 | 0 | data_bi = swizzle_table[swz.b]; |
423 | 0 | data_ai = swizzle_table[swz.a]; |
424 | 0 | } |
425 | | 
426 | | // Errors are NaN encoded - convert to magenta error color |
427 | | // Branch is OK here - it is almost never true so predicts well |
428 | 78.9k | vmask nan_mask = data_r != data_r; |
429 | 78.9k | if (any(nan_mask)) |
430 | 19.8k | { |
431 | 19.8k | data_ri = select(data_ri, vint(0xFF), nan_mask); |
432 | 19.8k | data_gi = select(data_gi, vint(0x00), nan_mask); |
433 | 19.8k | data_bi = select(data_bi, vint(0xFF), nan_mask); |
434 | 19.8k | data_ai = select(data_ai, vint(0xFF), nan_mask); |
435 | 19.8k | } |
436 | | // Mask off lanes beyond used_texels so the store stays inside the clipped row |
437 | 78.9k | vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai); |
438 | 78.9k | vmask store_mask = vint::lane_id() < vint(used_texels); |
439 | 78.9k | store_lanes_masked(data8_row, data_rgbai, store_mask); |
440 | | 
441 | 78.9k | data8_row += ASTCENC_SIMD_WIDTH * 4; |
442 | 78.9k | idx += used_texels; |
443 | 78.9k | } |
444 | 29.8k | idx += x_nudge; |
445 | 29.8k | } |
446 | 3.95k | idx += y_nudge; |
447 | 3.95k | } |
448 | 3.95k | } |
449 | 0 | else if (img.data_type == ASTCENC_TYPE_F16) |
450 | 0 | { |
451 | 0 | for (unsigned int z = z_start; z < z_end; z++) |
452 | 0 | { |
453 | | // Fetch the image plane |
454 | 0 | uint16_t* data16 = static_cast<uint16_t*>(img.data[z]); |
455 | 

456 | 0 | for (unsigned int y = y_start; y < y_end; y++) |
457 | 0 | { |
458 | 0 | uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start); |
459 | 

460 | 0 | for (unsigned int x = 0; x < x_count; x++) |
461 | 0 | { |
462 | 0 | vint4 color; |
463 | | 
464 | | // NaNs are handled inline - no need to special case |
465 | 0 | if (needs_swz) |
466 | 0 | { |
467 | 0 | float data[7]; |
468 | 0 | data[ASTCENC_SWZ_0] = 0.0f; |
469 | 0 | data[ASTCENC_SWZ_1] = 1.0f; |
470 | 0 | data[ASTCENC_SWZ_R] = blk.data_r[idx]; |
471 | 0 | data[ASTCENC_SWZ_G] = blk.data_g[idx]; |
472 | 0 | data[ASTCENC_SWZ_B] = blk.data_b[idx]; |
473 | 0 | data[ASTCENC_SWZ_A] = blk.data_a[idx]; |
474 | 

475 | 0 | if (needs_z) |
476 | 0 | { |
477 | 0 | float xN = (data[0] * 2.0f) - 1.0f; |
478 | 0 | float yN = (data[3] * 2.0f) - 1.0f; |
479 | 0 | float zN = 1.0f - xN * xN - yN * yN; |
480 | 0 | if (zN < 0.0f) |
481 | 0 | { |
482 | 0 | zN = 0.0f; |
483 | 0 | } |
484 | 0 | data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; |
485 | 0 | } |
486 | 

487 | 0 | vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); |
488 | 0 | color = float_to_float16(colorf); |
489 | 0 | } |
490 | 0 | else |
491 | 0 | { |
492 | 0 | vfloat4 colorf = blk.texel(idx); |
493 | 0 | color = float_to_float16(colorf); |
494 | 0 | } |
495 | | 
496 | | // TODO: Vectorize with store N shorts? |
497 | 0 | data16_row[0] = static_cast<uint16_t>(color.lane<0>()); |
498 | 0 | data16_row[1] = static_cast<uint16_t>(color.lane<1>()); |
499 | 0 | data16_row[2] = static_cast<uint16_t>(color.lane<2>()); |
500 | 0 | data16_row[3] = static_cast<uint16_t>(color.lane<3>()); |
501 | 0 | data16_row += 4; |
502 | 0 | idx++; |
503 | 0 | } |
504 | 0 | idx += x_nudge; |
505 | 0 | } |
506 | 0 | idx += y_nudge; |
507 | 0 | } |
508 | 0 | } |
509 | 0 | else // if (img.data_type == ASTCENC_TYPE_F32) |
510 | 0 | { |
511 | 0 | assert(img.data_type == ASTCENC_TYPE_F32); |
512 | | 
513 | 0 | for (unsigned int z = z_start; z < z_end; z++) |
514 | 0 | { |
515 | | // Fetch the image plane |
516 | 0 | float* data32 = static_cast<float*>(img.data[z]); |
517 | 

518 | 0 | for (unsigned int y = y_start; y < y_end; y++) |
519 | 0 | { |
520 | 0 | float* data32_row = data32 + (4 * x_size * y) + (4 * x_start); |
521 | 

522 | 0 | for (unsigned int x = 0; x < x_count; x++) |
523 | 0 | { |
524 | 0 | vfloat4 color = blk.texel(idx); |
525 | | 
526 | | // NaNs are handled inline - no need to special case |
527 | 0 | if (needs_swz) |
528 | 0 | { |
529 | 0 | float data[7]; |
530 | 0 | data[ASTCENC_SWZ_0] = 0.0f; |
531 | 0 | data[ASTCENC_SWZ_1] = 1.0f; |
532 | 0 | data[ASTCENC_SWZ_R] = color.lane<0>(); |
533 | 0 | data[ASTCENC_SWZ_G] = color.lane<1>(); |
534 | 0 | data[ASTCENC_SWZ_B] = color.lane<2>(); |
535 | 0 | data[ASTCENC_SWZ_A] = color.lane<3>(); |
536 | 

537 | 0 | if (needs_z) |
538 | 0 | { |
539 | 0 | float xN = (data[0] * 2.0f) - 1.0f; |
540 | 0 | float yN = (data[3] * 2.0f) - 1.0f; |
541 | 0 | float zN = 1.0f - xN * xN - yN * yN; |
542 | 0 | if (zN < 0.0f) |
543 | 0 | { |
544 | 0 | zN = 0.0f; |
545 | 0 | } |
546 | 0 | data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; |
547 | 0 | } |
548 | 

549 | 0 | color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); |
550 | 0 | } |
551 | 

552 | 0 | store(color, data32_row); |
553 | 0 | data32_row += 4; |
554 | 0 | idx++; |
555 | 0 | } |
556 | 0 | idx += x_nudge; |
557 | 0 | } |
558 | 0 | idx += y_nudge; |
559 | 0 | } |
560 | 0 | } |
561 | 3.95k | } |