/src/astc-encoder/Source/astcenc_image.cpp

Source
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2026 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Functions for creating in-memory ASTC image structures.
 */

#include <cassert>
#include <cstring>

#include "astcenc_internal.h"

/**
 * @brief Loader pipeline function type for data fetch from memory.
 *
 * The first argument is the base pointer of the image plane being read, and
 * the second is the index offset of the texel's first component, counted in
 * components of the plane's data type (the loaders cast the pointer to their
 * component type before adding the offset). The texel is returned as a float
 * vector.
 */
using pixel_loader = vfloat4(*)(const void*, size_t);

/**
 * @brief Loader pipeline function type for swizzling data in a vector.
 *
 * Takes the loaded texel and the swizzle to apply, and returns the texel with
 * its components rearranged.
 */
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);

/**
 * @brief Loader pipeline function type for converting data in a vector to LNS.
 *
 * Takes the swizzled texel and the mask of components that need LNS encoding,
 * and returns the texel in the encoded form that is stored in the image block.
 */
using pixel_converter = vfloat4(*)(vfloat4, vmask4);

/**
 * @brief Fetch one 8-bit UNORM texel from a data array as floats.
 *
 * The components at the texel address are widened to float and divided by
 * 255, so an input component of 255 becomes 1.0.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The texel as a float vector.
 */
static vfloat4 load_texel_u8(
  const void* data,
  size_t base_offset
) {
  // Address of the first component of the requested texel
  const uint8_t* texel = static_cast<const uint8_t*>(data) + base_offset;

  vint4 texel_unorm8(texel);
  vfloat4 texel_f = int_to_float(texel_unorm8);
  return texel_f / 255.0f;
}

/**
 * @brief Fetch one fp16 texel from a data array as floats.
 *
 * The four 16-bit components are read as raw bit patterns and then expanded
 * to 32-bit floats.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The texel as a float vector.
 */
static vfloat4 load_texel_f16(
  const void* data,
  size_t base_offset
) {
  // Address of the first component of the requested texel
  const uint16_t* texel = static_cast<const uint16_t*>(data) + base_offset;

  // Gather the raw half-float bit patterns into integer lanes
  vint4 half_bits(texel[0], texel[1], texel[2], texel[3]);
  return float16_to_float(half_bits);
}

/**
 * @brief Fetch one 32-bit float texel from a data array.
 *
 * No conversion is applied; the components are loaded as stored.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The texel as a float vector.
 */
static vfloat4 load_texel_f32(
  const void* data,
  size_t base_offset
) {
  // Address of the first component of the requested texel
  const float* texel = static_cast<const float*>(data) + base_offset;
  return vfloat4(texel);
}

/**
 * @brief Pass-through swizzle stage used when no swizzle is required.
 *
 * @param data   The source RGBA vector.
 * @param swz    The swizzle; accepted to match the pipeline signature but unused.
 *
 * @return The input vector, unchanged.
 */
static vfloat4 swz_texel_skip(
  vfloat4 data,
  const astcenc_swizzle& swz
) {
  // Parameter exists only so this matches the pixel_swizzler signature
  static_cast<void>(swz);
  return data;
}

/**
 * @brief Rearrange the components of a texel according to a swizzle.
 *
 * @param data   The source RGBA vector to swizzle.
 * @param swz    The swizzle to use.
 *
 * @return The vector whose R, G, B, and A lanes hold the values selected by
 *         swz.r, swz.g, swz.b, and swz.a respectively.
 */
static vfloat4 swz_texel(
  vfloat4 data,
  const astcenc_swizzle& swz
) {
  // Lookup table indexed by swizzle selector: the source components are
  // stored first, then the constant selectors are filled in.
  // NOTE(review): the table has 6 entries, so a selector with value 6 or more
  // (store_image_block uses a 7-entry table to fit ASTCENC_SWZ_Z) would index
  // out of bounds here - confirm callers never pass such a selector.
  ASTCENC_ALIGNAS float lut[6];

  storea(data, lut);
  lut[ASTCENC_SWZ_0] = 0.0f;
  lut[ASTCENC_SWZ_1] = 1.0f;

  float out_r = lut[swz.r];
  float out_g = lut[swz.g];
  float out_b = lut[swz.b];
  float out_a = lut[swz.a];

  return vfloat4(out_r, out_g, out_b, out_a);
}

/**
 * @brief Encode a texel that is entirely LDR linear.
 *
 * Every component is scaled by 65535.
 *
 * @param data       The RGBA data to encode.
 * @param lns_mask   The mask for the HDR channels that need LNS encoding;
 *                   accepted to match the pipeline signature but unused.
 *
 * @return The scaled texel.
 */
static vfloat4 encode_texel_unorm(
  vfloat4 data,
  vmask4 lns_mask
) {
  // Parameter exists only so this matches the pixel_converter signature
  static_cast<void>(lns_mask);

  vfloat4 scaled = data * 65535.0f;
  return scaled;
}

/**
 * @brief Encode a texel that includes at least some HDR LNS components.
 *
 * Both encodings are computed for every component, and the mask then picks
 * between the 65535-scaled value and the LNS value per component.
 *
 * @param data       The RGBA data to encode.
 * @param lns_mask   The mask for the HDR channels that need LNS encoding.
 *
 * @return The encoded texel.
 */
static vfloat4 encode_texel_lns(
  vfloat4 data,
  vmask4 lns_mask
) {
  return select(data * 65535.0f, float_to_lns(data), lns_mask);
}

/* See header for documentation. */
void load_image_block(
  astcenc_profile decode_mode,
  const astcenc_image& img,
  image_block& blk,
  const block_size_descriptor& bsd,
  size_t pos_x,
  size_t pos_y,
  size_t pos_z,
  const astcenc_swizzle& swz
) {
  // Record where this block sits in the image
  blk.pos_x = pos_x;
  blk.pos_y = pos_y;
  blk.pos_z = pos_z;

  // Image extents, and the last valid coordinate on each axis; fetches that
  // would land beyond the image are clamped back to these coordinates
  size_t width = img.dim_x;
  size_t height = img.dim_y;
  size_t depth = img.dim_z;
  size_t last_x = width - 1;
  size_t last_y = height - 1;
  size_t last_z = depth - 1;

  // The swizzle stage can be skipped when every channel maps to itself
  bool identity_swz = (swz.r == ASTCENC_SWZ_R) && (swz.g == ASTCENC_SWZ_G) &&
                      (swz.b == ASTCENC_SWZ_B) && (swz.a == ASTCENC_SWZ_A);

  // The LNS choice is made per profile, so it is the same for every texel:
  // both HDR profiles use LNS for RGB, only the full HDR profile for alpha
  bool hdr_rgba = decode_mode == ASTCENC_PRF_HDR;
  bool hdr_rgb = hdr_rgba || (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
  uint8_t rgb_lns = hdr_rgb ? 1 : 0;
  uint8_t a_lns = hdr_rgba ? 1 : 0;

  vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
  vmask4 lns_mask = use_lns != vint4::zero();

  // Pick the three pipeline stages once, outside the texel loop
  pixel_loader load_fn = load_texel_u8;
  switch (img.data_type)
  {
  case ASTCENC_TYPE_F16:
    load_fn = load_texel_f16;
    break;
  case ASTCENC_TYPE_F32:
    load_fn = load_texel_f32;
    break;
  default:
    break;
  }

  pixel_swizzler swizzle_fn = identity_swz ? swz_texel_skip : swz_texel;
  pixel_converter convert_fn = any(lns_mask) ? encode_texel_lns : encode_texel_unorm;

  // Running block statistics
  vfloat4 block_min(1e38f);
  vfloat4 block_max(-1e38f);
  vfloat4 block_mean(0.0f);
  vfloat4 mean_weight(1.0f / static_cast<float>(bsd.texel_count));
  vmask4 is_gray(true);

  size_t texel = 0;
  for (size_t dz = 0; dz < bsd.dim_z; dz++)
  {
    size_t src_z = astc::min(pos_z + dz, last_z);
    void* plane = img.data[src_z];

    for (size_t dy = 0; dy < bsd.dim_y; dy++)
    {
      size_t src_y = astc::min(pos_y + dy, last_y);

      // Four components per texel, rows stored back to back
      size_t row_offset = 4 * width * src_y;

      for (size_t dx = 0; dx < bsd.dim_x; dx++)
      {
        size_t src_x = astc::min(pos_x + dx, last_x);

        // Load -> swizzle -> encode
        vfloat4 value = load_fn(plane, row_offset + (4 * src_x));
        value = swizzle_fn(value, swz);
        value = convert_fn(value, lns_mask);

        // Fold this texel into the block statistics
        block_min = min(block_min, value);
        block_mean += value * mean_weight;
        block_max = max(block_max, value);

        // Still grayscale only while R == G and R == B for every texel
        is_gray = is_gray & (value.swz<0,0,0,0>() == value.swz<1,1,2,2>());

        // Scatter into the per-channel block arrays
        blk.data_r[texel] = value.lane<0>();
        blk.data_g[texel] = value.lane<1>();
        blk.data_b[texel] = value.lane<2>();
        blk.data_a[texel] = value.lane<3>();

        blk.rgb_lns[texel] = rgb_lns;
        blk.alpha_lns[texel] = a_lns;

        texel++;
      }
    }
  }

  // Undo the encoding of the first texel so the origin texel is stored in the
  // original format
  vfloat4 first_enc = blk.texel(0);
  vfloat4 first_lns = vfloat4::zero();

  if ((rgb_lns != 0) || (a_lns != 0))
  {
    first_lns = float16_to_float(lns_to_sf16(float_to_int(first_enc)));
  }

  blk.origin_texel = select(first_enc / 65535.0f, first_lns, lns_mask);

  // Publish the block statistics
  blk.data_min = block_min;
  blk.data_mean = block_mean;
  blk.data_max = block_max;
  blk.grayscale = all(is_gray);
}

/* See header for documentation. */
void load_image_block_fast_ldr(
  astcenc_profile decode_mode,
  const astcenc_image& img,
  image_block& blk,
  const block_size_descriptor& bsd,
  size_t pos_x,
  size_t pos_y,
  size_t pos_z,
  const astcenc_swizzle& swz
) {
  // This path reads 8-bit data from img.data[0] only, applies no swizzle, and
  // never LNS encodes, so these parameters are not consulted.
  // NOTE(review): presumably callers only select this path for 2D LDR U8
  // images with an identity swizzle - confirm against the call site.
  static_cast<void>(swz);
  static_cast<void>(decode_mode);

  // Record where this block sits in the image
  blk.pos_x = pos_x;
  blk.pos_y = pos_y;
  blk.pos_z = pos_z;

  // Image extents, and the last valid coordinate on each axis; fetches that
  // would land beyond the image are clamped back to these coordinates
  size_t width = img.dim_x;
  size_t height = img.dim_y;
  size_t last_x = width - 1;
  size_t last_y = height - 1;

  // Running block statistics; the mean is accumulated as a plain sum and
  // normalized once at the end
  vfloat4 block_min(1e38f);
  vfloat4 block_max(-1e38f);
  vfloat4 block_sum = vfloat4::zero();
  vmask4 is_gray(true);

  const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);

  size_t texel = 0;
  size_t end_y = pos_y + bsd.dim_y;
  size_t end_x = pos_x + bsd.dim_x;

  for (size_t by = pos_y; by < end_y; by++)
  {
    size_t src_y = astc::min(by, last_y);

    // Four bytes per texel, rows stored back to back
    const uint8_t* row = plane + (4 * width * src_y);

    for (size_t bx = pos_x; bx < end_x; bx++)
    {
      size_t src_x = astc::min(bx, last_x);

      // Load the 8-bit components and rescale from 0..255 to 0..65535
      vint4 value_i = vint4(row + (4 * src_x));
      vfloat4 value = int_to_float(value_i) * (65535.0f / 255.0f);

      // Fold this texel into the block statistics
      block_min = min(block_min, value);
      block_sum += value;
      block_max = max(block_max, value);

      // Still grayscale only while R == G and R == B for every texel
      is_gray = is_gray & (value.swz<0,0,0,0>() == value.swz<1,1,2,2>());

      // Scatter into the per-channel block arrays
      blk.data_r[texel] = value.lane<0>();
      blk.data_g[texel] = value.lane<1>();
      blk.data_b[texel] = value.lane<2>();
      blk.data_a[texel] = value.lane<3>();

      texel++;
    }
  }

  // Undo the encoding of the first texel so the origin texel is stored in the
  // original format
  blk.origin_texel = blk.texel(0) / 65535.0f;

  // Publish the block metadata; only entry 0 of the LNS flag arrays is
  // written on this path
  blk.rgb_lns[0] = 0;
  blk.alpha_lns[0] = 0;
  blk.data_min = block_min;
  blk.data_mean = block_sum / static_cast<float>(bsd.texel_count);
  blk.data_max = block_max;
  blk.grayscale = all(is_gray);
}

/* See header for documentation. */
// Implementation notes:
//   - Writes the texels of blk into img starting at (pos_x, pos_y, pos_z),
//     clipped to the image dimensions so partial edge blocks do not write
//     outside the image.
//   - One branch per output data type: U8 processes ASTCENC_SIMD_WIDTH texels
//     per iteration, F16 and F32 process one texel per iteration.
//   - The swizzle is applied through a small lookup table indexed by the
//     swizzle selector; the Z entry is only filled in when a channel asks
//     for it.
void store_image_block(
  astcenc_image& img,
  const image_block& blk,
  const block_size_descriptor& bsd,
  size_t pos_x,
  size_t pos_y,
  size_t pos_z,
  const astcenc_swizzle& swz
) {
  // Clip the block footprint to the image on X. nudge_x is the number of block
  // texels per row that fall outside the image and must be skipped in blk.
  size_t size_x = img.dim_x;
  size_t start_x = pos_x;
  size_t end_x = astc::min(size_x, pos_x + bsd.dim_x);
  size_t count_x = end_x - start_x;
  size_t nudge_x = bsd.dim_x - count_x;

  // Clip on Y. nudge_y is the number of block texels in the whole rows that
  // fall outside the image, skipped once per Z slice.
  size_t size_y = img.dim_y;
  size_t start_y = pos_y;
  size_t end_y = astc::min(size_y, pos_y + bsd.dim_y);
  size_t count_y = end_y - start_y;
  size_t nudge_y = (bsd.dim_y - count_y) * bsd.dim_x;

  // Clip on Z
  size_t size_z = img.dim_z;
  size_t start_z = pos_z;
  size_t end_z = astc::min(size_z, pos_z + bsd.dim_z);

  // Running index into the block's per-channel texel arrays
  size_t idx = 0;

  // True if any non-identity swizzle
  bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
                   (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

  // True if any swizzle uses Z reconstruct
  bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
                 (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

  if (img.data_type == ASTCENC_TYPE_U8)
  {
    for (size_t z = start_z; z < end_z; z++)
    {
      // Fetch the image plane
      uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);

      for (size_t y = start_y; y < end_y; y++)
      {
        // Four bytes per texel; start at the block's first column in this row
        uint8_t* data8_row = data8 + (4 * size_x * y) + (4 * start_x);

        for (size_t x = 0; x < count_x; x += ASTCENC_SIMD_WIDTH)
        {
          // The final iteration of a row may cover fewer than SIMD_WIDTH texels
          size_t max_texels = ASTCENC_SIMD_WIDTH;
          size_t used_texels = astc::min(count_x - x, max_texels);

          // Unaligned load as rows are not always SIMD_WIDTH long
          vfloat data_r(blk.data_r + idx);
          vfloat data_g(blk.data_g + idx);
          vfloat data_b(blk.data_b + idx);
          vfloat data_a(blk.data_a + idx);

          // Clamp values to [0.0, 1.0] range before unorm conversion
          //   - Values > 1.0 are possible for all HDR blocks
          //   - Values < 0.0 are possible for HDR void-extent blocks
          vint data_ri = float_to_int_rtn(clampzo(data_r) * 255.0f);
          vint data_gi = float_to_int_rtn(clampzo(data_g) * 255.0f);
          vint data_bi = float_to_int_rtn(clampzo(data_b) * 255.0f);
          vint data_ai = float_to_int_rtn(clampzo(data_a) * 255.0f);

          if (needs_swz)
          {
            // Lookup table indexed by swizzle selector
            vint swizzle_table[7];
            swizzle_table[ASTCENC_SWZ_0] = vint(0);
            swizzle_table[ASTCENC_SWZ_1] = vint(255);
            swizzle_table[ASTCENC_SWZ_R] = data_ri;
            swizzle_table[ASTCENC_SWZ_G] = data_gi;
            swizzle_table[ASTCENC_SWZ_B] = data_bi;
            swizzle_table[ASTCENC_SWZ_A] = data_ai;

            if (needs_z)
            {
              // Reconstruct Z from the unclamped float R and A channels:
              // expand both from [0, 1] to [-1, 1], compute
              // sqrt(1 - x^2 - y^2) with the radicand floored at zero, then
              // map the result back to [0, 1]
              vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
              vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
              vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
              data_z = max(data_z, 0.0f);
              data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);

              swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
            }

            data_ri = swizzle_table[swz.r];
            data_gi = swizzle_table[swz.g];
            data_bi = swizzle_table[swz.b];
            data_ai = swizzle_table[swz.a];
          }

          // Errors are NaN encoded - convert to magenta error color
          // Branch is OK here - it is almost never true so predicts well
          // Only the red channel is tested for NaN, and the override is applied
          // after the swizzle so the error color itself is never swizzled
          vmask nan_mask = data_r != data_r;
          if (any(nan_mask))
          {
            data_ri = select(data_ri, vint(0xFF), nan_mask);
            data_gi = select(data_gi, vint(0x00), nan_mask);
            data_bi = select(data_bi, vint(0xFF), nan_mask);
            data_ai = select(data_ai, vint(0xFF), nan_mask);
          }

          vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
          // Static cast must be safe, as used_texels is at most the vector length
          // Only the lanes for texels inside the image are written
          vmask store_mask = vint::lane_id() < vint(static_cast<int>(used_texels));
          store_lanes_masked(data8_row, data_rgbai, store_mask);

          data8_row += ASTCENC_SIMD_WIDTH * 4;
          idx += used_texels;
        }
        // Skip the block texels to the right of the image edge
        idx += nudge_x;
      }
      // Skip the block rows below the image edge
      idx += nudge_y;
    }
  }
  else if (img.data_type == ASTCENC_TYPE_F16)
  {
    for (size_t z = start_z; z < end_z; z++)
    {
      // Fetch the image plane
      uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);

      for (size_t y = start_y; y < end_y; y++)
      {
        // Four halfs per texel; start at the block's first column in this row
        uint16_t* data16_row = data16 + (4 * size_x * y) + (4 * start_x);

        for (size_t x = 0; x < count_x; x++)
        {
          vint4 color;

          // NaNs are handled inline - no need to special case
          if (needs_swz)
          {
            // Lookup table indexed by swizzle selector
            float data[7];
            data[ASTCENC_SWZ_0] = 0.0f;
            data[ASTCENC_SWZ_1] = 1.0f;
            data[ASTCENC_SWZ_R] = blk.data_r[idx];
            data[ASTCENC_SWZ_G] = blk.data_g[idx];
            data[ASTCENC_SWZ_B] = blk.data_b[idx];
            data[ASTCENC_SWZ_A] = blk.data_a[idx];

            if (needs_z)
            {
              // Same Z reconstruction as the U8 path, in scalar form.
              // NOTE(review): entries 0 and 3 are read by literal index;
              // presumably these are the ASTCENC_SWZ_R and ASTCENC_SWZ_A
              // slots, matching the U8 path's use of R and A - confirm.
              float xN = (data[0] * 2.0f) - 1.0f;
              float yN = (data[3] * 2.0f) - 1.0f;
              float zN = 1.0f - xN * xN - yN * yN;
              if (zN < 0.0f)
              {
                zN = 0.0f;
              }
              data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
            }

            vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
            color = float_to_float16(colorf);
          }
          else
          {
            vfloat4 colorf = blk.texel(idx);
            color = float_to_float16(colorf);
          }

          // TODO: Vectorize with store N shorts?
          data16_row[0] = static_cast<uint16_t>(color.lane<0>());
          data16_row[1] = static_cast<uint16_t>(color.lane<1>());
          data16_row[2] = static_cast<uint16_t>(color.lane<2>());
          data16_row[3] = static_cast<uint16_t>(color.lane<3>());
          data16_row += 4;
          idx++;
        }
        // Skip the block texels to the right of the image edge
        idx += nudge_x;
      }
      // Skip the block rows below the image edge
      idx += nudge_y;
    }
  }
  else // if (img.data_type == ASTCENC_TYPE_F32)
  {
    assert(img.data_type == ASTCENC_TYPE_F32);

    for (size_t z = start_z; z < end_z; z++)
    {
      // Fetch the image plane
      float* data32 = static_cast<float*>(img.data[z]);

      for (size_t y = start_y; y < end_y; y++)
      {
        // Four floats per texel; start at the block's first column in this row
        float* data32_row = data32 + (4 * size_x * y) + (4 * start_x);

        for (size_t x = 0; x < count_x; x++)
        {
          vfloat4 color = blk.texel(idx);

          // NaNs are handled inline - no need to special case
          if (needs_swz)
          {
            // Lookup table indexed by swizzle selector
            float data[7];
            data[ASTCENC_SWZ_0] = 0.0f;
            data[ASTCENC_SWZ_1] = 1.0f;
            data[ASTCENC_SWZ_R] = color.lane<0>();
            data[ASTCENC_SWZ_G] = color.lane<1>();
            data[ASTCENC_SWZ_B] = color.lane<2>();
            data[ASTCENC_SWZ_A] = color.lane<3>();

            if (needs_z)
            {
              // Same Z reconstruction as the U8 path, in scalar form.
              // NOTE(review): entries 0 and 3 are read by literal index;
              // presumably these are the ASTCENC_SWZ_R and ASTCENC_SWZ_A
              // slots, matching the U8 path's use of R and A - confirm.
              float xN = (data[0] * 2.0f) - 1.0f;
              float yN = (data[3] * 2.0f) - 1.0f;
              float zN = 1.0f - xN * xN - yN * yN;
              if (zN < 0.0f)
              {
                zN = 0.0f;
              }
              data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
            }

            color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
          }

          store(color, data32_row);
          data32_row += 4;
          idx++;
        }
        // Skip the block texels to the right of the image edge
        idx += nudge_x;
      }
      // Skip the block rows below the image edge
      idx += nudge_y;
    }
  }
}

Coverage Report

Created: 2026-06-10 07:06

Line	Count	Source
1		// SPDX-License-Identifier: Apache-2.0
2		// ----------------------------------------------------------------------------
3		// Copyright 2011-2026 Arm Limited
4		//
5		// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6		// use this file except in compliance with the License. You may obtain a copy
7		// of the License at:
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13		// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14		// License for the specific language governing permissions and limitations
15		// under the License.
16		// ----------------------------------------------------------------------------
17
18		/**
19		* @brief Functions for creating in-memory ASTC image structures.
20		*/
21
22		#include <cassert>
23		#include <cstring>
24
25		#include "astcenc_internal.h"
26
27		/**
28		* @brief Loader pipeline function type for data fetch from memory.
29		*/
30		using pixel_loader = vfloat4(*)(const void*, size_t);
31
32		/**
33		* @brief Loader pipeline function type for swizzling data in a vector.
34		*/
35		using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
36
37		/**
38		* @brief Loader pipeline function type for converting data in a vector to LNS.
39		*/
40		using pixel_converter = vfloat4(*)(vfloat4, vmask4);
41
42		/**
43		* @brief Load a 8-bit UNORM texel from a data array.
44		*
45		* @param data The data pointer.
46		* @param base_offset The index offset to the start of the pixel.
47		*/
48		static vfloat4 load_texel_u8(
49		const void* data,
50		size_t base_offset
51	25.9k	) {
52	25.9k	const uint8_t* data8 = static_cast<const uint8_t*>(data);
53	25.9k	return int_to_float(vint4(data8 + base_offset)) / 255.0f;
54	25.9k	}
55
56		/**
57		* @brief Load a 16-bit fp16 texel from a data array.
58		*
59		* @param data The data pointer.
60		* @param base_offset The index offset to the start of the pixel.
61		*/
62		static vfloat4 load_texel_f16(
63		const void* data,
64		size_t base_offset
65	0	) {
66	0	const uint16_t* data16 = static_cast<const uint16_t*>(data);
67	0	int r = data16[base_offset ];
68	0	int g = data16[base_offset + 1];
69	0	int b = data16[base_offset + 2];
70	0	int a = data16[base_offset + 3];
71	0	return float16_to_float(vint4(r, g, b, a));
72	0	}
73
74		/**
75		* @brief Load a 32-bit float texel from a data array.
76		*
77		* @param data The data pointer.
78		* @param base_offset The index offset to the start of the pixel.
79		*/
80		static vfloat4 load_texel_f32(
81		const void* data,
82		size_t base_offset
83	0	) {
84	0	const float* data32 = static_cast<const float*>(data);
85	0	return vfloat4(data32 + base_offset);
86	0	}
87
88		/**
89		* @brief Dummy no-op swizzle function.
90		*
91		* @param data The source RGBA vector to swizzle.
92		* @param swz The swizzle to use.
93		*/
94		static vfloat4 swz_texel_skip(
95		vfloat4 data,
96		const astcenc_swizzle& swz
97	25.9k	) {
98	25.9k	(void)swz;
99	25.9k	return data;
100	25.9k	}
101
102		/**
103		* @brief Swizzle a texel into a new arrangement.
104		*
105		* @param data The source RGBA vector to swizzle.
106		* @param swz The swizzle to use.
107		*/
108		static vfloat4 swz_texel(
109		vfloat4 data,
110		const astcenc_swizzle& swz
111	0	) {
112	0	ASTCENC_ALIGNAS float datas[6];
113
114	0	storea(data, datas);
115	0	datas[ASTCENC_SWZ_0] = 0.0f;
116	0	datas[ASTCENC_SWZ_1] = 1.0f;
117
118	0	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
119	0	}
120
121		/**
122		* @brief Encode a texel that is entirely LDR linear.
123		*
124		* @param data The RGBA data to encode.
125		* @param lns_mask The mask for the HDR channels than need LNS encoding.
126		*/
127		static vfloat4 encode_texel_unorm(
128		vfloat4 data,
129		vmask4 lns_mask
130	0	) {
131	0	(void)lns_mask;
132	0	return data * 65535.0f;
133	0	}
134
135		/**
136		* @brief Encode a texel that includes at least some HDR LNS texels.
137		*
138		* @param data The RGBA data to encode.
139		* @param lns_mask The mask for the HDR channels than need LNS encoding.
140		*/
141		static vfloat4 encode_texel_lns(
142		vfloat4 data,
143		vmask4 lns_mask
144	25.9k	) {
145	25.9k	vfloat4 datav_unorm = data * 65535.0f;
146	25.9k	vfloat4 datav_lns = float_to_lns(data);
147	25.9k	return select(datav_unorm, datav_lns, lns_mask);
148	25.9k	}
149
150		/* See header for documentation. */
151		void load_image_block(
152		astcenc_profile decode_mode,
153		const astcenc_image& img,
154		image_block& blk,
155		const block_size_descriptor& bsd,
156		size_t pos_x,
157		size_t pos_y,
158		size_t pos_z,
159		const astcenc_swizzle& swz
160	1.12k	) {
161	1.12k	size_t size_x = img.dim_x;
162	1.12k	size_t size_y = img.dim_y;
163	1.12k	size_t size_z = img.dim_z;
164
165	1.12k	blk.pos_x = pos_x;
166	1.12k	blk.pos_y = pos_y;
167	1.12k	blk.pos_z = pos_z;
168
169		// True if any non-identity swizzle
170	1.12k	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
171	1.12k	(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
172
173	1.12k	vfloat4 data_min(1e38f);
174	1.12k	vfloat4 data_mean(0.0f);
175	1.12k	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
176	1.12k	vfloat4 data_max(-1e38f);
177	1.12k	vmask4 grayscalev(true);
178	1.12k	size_t idx = 0;
179
180		// This works because we impose the same choice everywhere during encode
181	1.12k	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
182	1.12k	(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
183	1.12k	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
184	1.12k	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
185	1.12k	vmask4 lns_mask = use_lns != vint4::zero();
186
187		// Set up the function pointers for loading pipeline as needed
188	1.12k	pixel_loader loader = load_texel_u8;
189	1.12k	if (img.data_type == ASTCENC_TYPE_F16)
190	0	{
191	0	loader = load_texel_f16;
192	0	}
193	1.12k	else if (img.data_type == ASTCENC_TYPE_F32)
194	0	{
195	0	loader = load_texel_f32;
196	0	}
197
198	1.12k	pixel_swizzler swizzler = swz_texel_skip;
199	1.12k	if (needs_swz)
200	0	{
201	0	swizzler = swz_texel;
202	0	}
203
204	1.12k	pixel_converter converter = encode_texel_unorm;
205	1.12k	if (any(lns_mask))
206	1.12k	{
207	1.12k	converter = encode_texel_lns;
208	1.12k	}
209
210	2.24k	for (size_t z = 0; z < bsd.dim_z; z++)
211	1.12k	{
212	1.12k	size_t zi = astc::min(pos_z + z, size_z - 1);
213	1.12k	void* plane = img.data[zi];
214
215	6.09k	for (size_t y = 0; y < bsd.dim_y; y++)
216	4.97k	{
217	4.97k	size_t yi = astc::min(pos_y + y, size_y - 1);
218
219	30.9k	for (size_t x = 0; x < bsd.dim_x; x++)
220	25.9k	{
221	25.9k	size_t xi = astc::min(pos_x + x, size_x - 1);
222
223	25.9k	vfloat4 datav = loader(plane, (4 * size_x * yi) + (4 * xi));
224	25.9k	datav = swizzler(datav, swz);
225	25.9k	datav = converter(datav, lns_mask);
226
227		// Compute block metadata
228	25.9k	data_min = min(data_min, datav);
229	25.9k	data_mean += datav * data_mean_scale;
230	25.9k	data_max = max(data_max, datav);
231
232	25.9k	grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
233
234	25.9k	blk.data_r[idx] = datav.lane<0>();
235	25.9k	blk.data_g[idx] = datav.lane<1>();
236	25.9k	blk.data_b[idx] = datav.lane<2>();
237	25.9k	blk.data_a[idx] = datav.lane<3>();
238
239	25.9k	blk.rgb_lns[idx] = rgb_lns;
240	25.9k	blk.alpha_lns[idx] = a_lns;
241
242	25.9k	idx++;
243	25.9k	}
244	4.97k	}
245	1.12k	}
246
247		// Reverse the encoding so we store origin block in the original format
248	1.12k	vfloat4 data_enc = blk.texel(0);
249	1.12k	vfloat4 data_enc_unorm = data_enc / 65535.0f;
250	1.12k	vfloat4 data_enc_lns = vfloat4::zero();
251
252	1.12k	if (rgb_lns || a_lns)
253	1.12k	{
254	1.12k	data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
255	1.12k	}
256
257	1.12k	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
258
259		// Store block metadata
260	1.12k	blk.data_min = data_min;
261	1.12k	blk.data_mean = data_mean;
262	1.12k	blk.data_max = data_max;
263	1.12k	blk.grayscale = all(grayscalev);
264	1.12k	}
265
266		/* See header for documentation. */
267		void load_image_block_fast_ldr(
268		astcenc_profile decode_mode,
269		const astcenc_image& img,
270		image_block& blk,
271		const block_size_descriptor& bsd,
272		size_t pos_x,
273		size_t pos_y,
274		size_t pos_z,
275		const astcenc_swizzle& swz
276	1.11k	) {
277	1.11k	(void)swz;
278	1.11k	(void)decode_mode;
279
280	1.11k	size_t size_x = img.dim_x;
281	1.11k	size_t size_y = img.dim_y;
282
283	1.11k	blk.pos_x = pos_x;
284	1.11k	blk.pos_y = pos_y;
285	1.11k	blk.pos_z = pos_z;
286
287	1.11k	vfloat4 data_min(1e38f);
288	1.11k	vfloat4 data_mean = vfloat4::zero();
289	1.11k	vfloat4 data_max(-1e38f);
290	1.11k	vmask4 grayscalev(true);
291	1.11k	size_t idx = 0;
292
293	1.11k	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
294	6.44k	for (size_t y = pos_y; y < pos_y + bsd.dim_y; y++)
295	5.33k	{
296	5.33k	size_t yi = astc::min(y, size_y - 1);
297
298	36.5k	for (size_t x = pos_x; x < pos_x + bsd.dim_x; x++)
299	31.2k	{
300	31.2k	size_t xi = astc::min(x, size_x - 1);
301
302	31.2k	vint4 datavi = vint4(plane + (4 * size_x * yi) + (4 * xi));
303	31.2k	vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
304
305		// Compute block metadata
306	31.2k	data_min = min(data_min, datav);
307	31.2k	data_mean += datav;
308	31.2k	data_max = max(data_max, datav);
309
310	31.2k	grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
311
312	31.2k	blk.data_r[idx] = datav.lane<0>();
313	31.2k	blk.data_g[idx] = datav.lane<1>();
314	31.2k	blk.data_b[idx] = datav.lane<2>();
315	31.2k	blk.data_a[idx] = datav.lane<3>();
316
317	31.2k	idx++;
318	31.2k	}
319	5.33k	}
320
321		// Reverse the encoding so we store origin block in the original format
322	1.11k	blk.origin_texel = blk.texel(0) / 65535.0f;
323
324		// Store block metadata
325	1.11k	blk.rgb_lns[0] = 0;
326	1.11k	blk.alpha_lns[0] = 0;
327	1.11k	blk.data_min = data_min;
328	1.11k	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
329	1.11k	blk.data_max = data_max;
330	1.11k	blk.grayscale = all(grayscalev);
331	1.11k	}
332
333		/* See header for documentation. */
334		void store_image_block(
335		astcenc_image& img,
336		const image_block& blk,
337		const block_size_descriptor& bsd,
338		size_t pos_x,
339		size_t pos_y,
340		size_t pos_z,
341		const astcenc_swizzle& swz
342	0	) {
343	0	size_t size_x = img.dim_x;
344	0	size_t start_x = pos_x;
345	0	size_t end_x = astc::min(size_x, pos_x + bsd.dim_x);
346	0	size_t count_x = end_x - start_x;
347	0	size_t nudge_x = bsd.dim_x - count_x;
348
349	0	size_t size_y = img.dim_y;
350	0	size_t start_y = pos_y;
351	0	size_t end_y = astc::min(size_y, pos_y + bsd.dim_y);
352	0	size_t count_y = end_y - start_y;
353	0	size_t nudge_y = (bsd.dim_y - count_y) * bsd.dim_x;
354
355	0	size_t size_z = img.dim_z;
356	0	size_t start_z = pos_z;
357	0	size_t end_z = astc::min(size_z, pos_z + bsd.dim_z);
358
359	0	size_t idx = 0;
360
361		// True if any non-identity swizzle
362	0	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
363	0	(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
364
365		// True if any swizzle uses Z reconstruct
366	0	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
367	0	(swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
368
369	0	if (img.data_type == ASTCENC_TYPE_U8)
370	0	{
371	0	for (size_t z = start_z; z < end_z; z++)
372	0	{
373		// Fetch the image plane
374	0	uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
375
376	0	for (size_t y = start_y; y < end_y; y++)
377	0	{
378	0	uint8_t* data8_row = data8 + (4 * size_x * y) + (4 * start_x);
379
380	0	for (size_t x = 0; x < count_x; x += ASTCENC_SIMD_WIDTH)
381	0	{
382	0	size_t max_texels = ASTCENC_SIMD_WIDTH;
383	0	size_t used_texels = astc::min(count_x - x, max_texels);
384
385		// Unaligned load as rows are not always SIMD_WIDTH long
386	0	vfloat data_r(blk.data_r + idx);
387	0	vfloat data_g(blk.data_g + idx);
388	0	vfloat data_b(blk.data_b + idx);
389	0	vfloat data_a(blk.data_a + idx);
390
391		// Clamp values to [0.0, 1.0] range before unorm conversion
392		// - Values > 1.0 are possible for all HDR blocks
393		// - Values < 0.0 are possible for HDR void-extent blocks
394	0	vint data_ri = float_to_int_rtn(clampzo(data_r) * 255.0f);
395	0	vint data_gi = float_to_int_rtn(clampzo(data_g) * 255.0f);
396	0	vint data_bi = float_to_int_rtn(clampzo(data_b) * 255.0f);
397	0	vint data_ai = float_to_int_rtn(clampzo(data_a) * 255.0f);
398
399	0	if (needs_swz)
400	0	{
401	0	vint swizzle_table[7];
402	0	swizzle_table[ASTCENC_SWZ_0] = vint(0);
403	0	swizzle_table[ASTCENC_SWZ_1] = vint(255);
404	0	swizzle_table[ASTCENC_SWZ_R] = data_ri;
405	0	swizzle_table[ASTCENC_SWZ_G] = data_gi;
406	0	swizzle_table[ASTCENC_SWZ_B] = data_bi;
407	0	swizzle_table[ASTCENC_SWZ_A] = data_ai;
408
409	0	if (needs_z)
410	0	{
411	0	vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
412	0	vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
413	0	vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
414	0	data_z = max(data_z, 0.0f);
415	0	data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
416
417	0	swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
418	0	}
419
420	0	data_ri = swizzle_table[swz.r];
421	0	data_gi = swizzle_table[swz.g];
422	0	data_bi = swizzle_table[swz.b];
423	0	data_ai = swizzle_table[swz.a];
424	0	}
425
426		// Errors are NaN encoded - convert to magenta error color
427		// Branch is OK here - it is almost never true so predicts well
428	0	vmask nan_mask = data_r != data_r;
429	0	if (any(nan_mask))
430	0	{
431	0	data_ri = select(data_ri, vint(0xFF), nan_mask);
432	0	data_gi = select(data_gi, vint(0x00), nan_mask);
433	0	data_bi = select(data_bi, vint(0xFF), nan_mask);
434	0	data_ai = select(data_ai, vint(0xFF), nan_mask);
435	0	}
436
437	0	vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
438		// Static cast must be safe, as used_texels must be less than vector length
439	0	vmask store_mask = vint::lane_id() < vint(static_cast<int>(used_texels));
440	0	store_lanes_masked(data8_row, data_rgbai, store_mask);
441
442	0	data8_row += ASTCENC_SIMD_WIDTH * 4;
443	0	idx += used_texels;
444	0	}
445	0	idx += nudge_x;
446	0	}
447	0	idx += nudge_y;
448	0	}
449	0	}
450	0	else if (img.data_type == ASTCENC_TYPE_F16)
451	0	{
452	0	for (size_t z = start_z; z < end_z; z++)
453	0	{
454		// Fetch the image plane
455	0	uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
456
457	0	for (size_t y = start_y; y < end_y; y++)
458	0	{
459	0	uint16_t* data16_row = data16 + (4 * size_x * y) + (4 * start_x);
460
461	0	for (size_t x = 0; x < count_x; x++)
462	0	{
463	0	vint4 color;
464
465		// NaNs are handled inline - no need to special case
466	0	if (needs_swz)
467	0	{
468	0	float data[7];
469	0	data[ASTCENC_SWZ_0] = 0.0f;
470	0	data[ASTCENC_SWZ_1] = 1.0f;
471	0	data[ASTCENC_SWZ_R] = blk.data_r[idx];
472	0	data[ASTCENC_SWZ_G] = blk.data_g[idx];
473	0	data[ASTCENC_SWZ_B] = blk.data_b[idx];
474	0	data[ASTCENC_SWZ_A] = blk.data_a[idx];
475
476	0	if (needs_z)
477	0	{
478	0	float xN = (data[0] * 2.0f) - 1.0f;
479	0	float yN = (data[3] * 2.0f) - 1.0f;
480	0	float zN = 1.0f - xN * xN - yN * yN;
481	0	if (zN < 0.0f)
482	0	{
483	0	zN = 0.0f;
484	0	}
485	0	data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
486	0	}
487
488	0	vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
489	0	color = float_to_float16(colorf);
490	0	}
491	0	else
492	0	{
493	0	vfloat4 colorf = blk.texel(idx);
494	0	color = float_to_float16(colorf);
495	0	}
496
497		// TODO: Vectorize with store N shorts?
498	0	data16_row[0] = static_cast<uint16_t>(color.lane<0>());
499	0	data16_row[1] = static_cast<uint16_t>(color.lane<1>());
500	0	data16_row[2] = static_cast<uint16_t>(color.lane<2>());
501	0	data16_row[3] = static_cast<uint16_t>(color.lane<3>());
502	0	data16_row += 4;
503	0	idx++;
504	0	}
505	0	idx += nudge_x;
506	0	}
507	0	idx += nudge_y;
508	0	}
509	0	}
510	0	else // if (img.data_type == ASTCENC_TYPE_F32)
511	0	{
512	0	assert(img.data_type == ASTCENC_TYPE_F32);
513
514	0	for (size_t z = start_z; z < end_z; z++)
515	0	{
516		// Fetch the image plane
517	0	float* data32 = static_cast<float*>(img.data[z]);
518
519	0	for (size_t y = start_y; y < end_y; y++)
520	0	{
521	0	float* data32_row = data32 + (4 * size_x * y) + (4 * start_x);
522
523	0	for (size_t x = 0; x < count_x; x++)
524	0	{
525	0	vfloat4 color = blk.texel(idx);
526
527		// NaNs are handled inline - no need to special case
528	0	if (needs_swz)
529	0	{
530	0	float data[7];
531	0	data[ASTCENC_SWZ_0] = 0.0f;
532	0	data[ASTCENC_SWZ_1] = 1.0f;
533	0	data[ASTCENC_SWZ_R] = color.lane<0>();
534	0	data[ASTCENC_SWZ_G] = color.lane<1>();
535	0	data[ASTCENC_SWZ_B] = color.lane<2>();
536	0	data[ASTCENC_SWZ_A] = color.lane<3>();
537
538	0	if (needs_z)
539	0	{
540	0	float xN = (data[0] * 2.0f) - 1.0f;
541	0	float yN = (data[3] * 2.0f) - 1.0f;
542	0	float zN = 1.0f - xN * xN - yN * yN;
543	0	if (zN < 0.0f)
544	0	{
545	0	zN = 0.0f;
546	0	}
547	0	data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
548	0	}
549
550	0	color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
551	0	}
552
553	0	store(color, data32_row);
554	0	data32_row += 4;
555	0	idx++;
556	0	}
557	0	idx += nudge_x;
558	0	}
559	0	idx += nudge_y;
560	0	}
561	0	}
562	0	}