/src/astc-encoder/Source/astcenc_weight_align.cpp

Source
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2026 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions for angular-sum algorithm for weight alignment.
 *
 * This algorithm works as follows:
 * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
 *   where i is the input value and s is a scaling factor based on the spacing between the weights.
 * - we then add together complex numbers for all the weights.
 * - we then compute the length and angle of the resulting sum.
 *
 * This should produce the following results:
 * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
 * - even distribution results in a vector of length 0.
 * - all samples identical results in perfect alignment for every scaling.
 *
 * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
 * should then result in some scalings standing out as having particularly good alignment factors;
 * we can use this to produce a set of candidate scale/shift values for various quantization levels;
 * we should then actually try them and see what happens.
 */

#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"

#include <stdio.h>
#include <cassert>
#include <cstring>
#include <cfloat>

static constexpr unsigned int ANGULAR_STEPS { 32 };

static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
              "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");

static_assert(ANGULAR_STEPS >= 32,
              "ANGULAR_STEPS must be at least max(steps_for_quant_level)");

// Store a reduced sin/cos table for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };

static const uint8_t steps_for_quant_level[12] {
  2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};

ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];

#if defined(ASTCENC_DIAGNOSTICS)
  static bool print_once { true };
#endif

/* See header for documentation. */
void prepare_angular_tables()
{
  for (unsigned int step = 0; step < ANGULAR_STEPS; step++)
  {
    // Column "step" of both tables is sampled at angular frequency (step + 1)
    const float frequency = static_cast<float>(step + 1);

    for (unsigned int row = 0; row < SINCOS_STEPS; row++)
    {
      // Row index (SINCOS_STEPS - 1) maps to one full turn at frequency 1; the same phase
      // value feeds both the sine and the cosine table entry
      const float phase = (2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * frequency * static_cast<float>(row);

      sin_table[row][step] = sinf(phase);
      cos_table[row][step] = cosf(phase);
    }
  }
}

/**
 * @brief Compute the angular alignment factors and offsets.
 *
 * @param      weight_count              The number of (decimated) weights.
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
 * @param      max_angular_steps         The maximum number of steps to be tested.
 * @param[out] offsets                   The output angular offsets array.
 */
static void compute_angular_offsets(
  unsigned int weight_count,
  const float* dec_weight_ideal_value,
  unsigned int max_angular_steps,
  float* offsets
) {
  promise(weight_count > 0);
  promise(max_angular_steps > 0);

  ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];

  // Precompute isample; arrays are always allocated 64 elements long
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  {
    // Ideal weight can be outside [0, 1] range, so clamp to fit table
    vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));

    // Convert a weight to a sincos table index
    vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
    vint isample = float_to_int_rtn(sample);
    storea(isample, isamplev + i);
  }

  // Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
  vfloat mult(1.0f / (2.0f * astc::PI));

  for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
  {
    vfloat anglesum_x = vfloat::zero();
    vfloat anglesum_y = vfloat::zero();

    for (unsigned int j = 0; j < weight_count; j++)
    {
      int isample = isamplev[j];
      anglesum_x += loada(cos_table[isample] + i);
      anglesum_y += loada(sin_table[isample] + i);
    }

    vfloat angle = atan2(anglesum_y, anglesum_x);

    // Suppress NaNs generated if anglesums are both zero
    angle = select(vfloat::zero(), angle, angle == angle);

    vfloat ofs = angle * mult;
    storea(ofs, offsets + i);
  }
}

/**
 * @brief For a given step size compute the lowest and highest weight.
 *
 * Compute the lowest and highest weight that results from quantizing using the given stepsize and
 * offset, and then compute the resulting error. The cut errors indicate the error that results from
 * forcing samples that should have had one weight value one step up or down.
 *
 * @param      weight_count              The number of (decimated) weights.
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
 * @param      max_angular_steps         The maximum number of steps to be tested.
 * @param      max_quant_steps           The maximum quantization level to be tested.
 * @param      offsets                   The angular offsets array.
 * @param[out] lowest_weight             Per angular step, the lowest weight.
 * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
 * @param[out] error                     Per angular step, the error.
 * @param[out] cut_low_weight_error      Per angular step, the low weight cut error.
 * @param[out] cut_high_weight_error     Per angular step, the high weight cut error.
 */
static void compute_lowest_and_highest_weight(
  unsigned int weight_count,
  const float* dec_weight_ideal_value,
  unsigned int max_angular_steps,
  unsigned int max_quant_steps,
  const float* offsets,
  float* lowest_weight,
  int* weight_span,
  float* error,
  float* cut_low_weight_error,
  float* cut_high_weight_error
) {
  promise(weight_count > 0);
  promise(max_angular_steps > 0);

  vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);

  // Compute minimum/maximum weights in the weight array. Our remapping
  // is monotonic, so the min/max rounded weights relate to the min/max
  // unrounded weights in a straightforward way.
  vfloat min_weight(FLT_MAX);
  vfloat max_weight(-FLT_MAX);

  vint lane_id = vint::lane_id();
  for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  {
    vmask active = lane_id < vint(weight_count);
    lane_id += vint(ASTCENC_SIMD_WIDTH);

    vfloat weights = loada(dec_weight_ideal_value + i);
    min_weight = min(select(min_weight, weights, active), min_weight);
    max_weight = max(select(max_weight, weights, active), max_weight);
  }

  min_weight = hmin(min_weight);
  max_weight = hmax(max_weight);

  // Arrays are ANGULAR_STEPS long, so always safe to run full vectors
  for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
  {
    vfloat errval = vfloat::zero();
    vfloat cut_low_weight_err = vfloat::zero();
    vfloat cut_high_weight_err = vfloat::zero();
    vfloat offset = loada(offsets + sp);

    // We know the min and max weight values, so we can figure out
    // the corresponding indices before we enter the loop.
    vfloat minidx = round(min_weight * rcp_stepsize - offset);
    vfloat maxidx = round(max_weight * rcp_stepsize - offset);

    for (unsigned int j = 0; j < weight_count; j++)
    {
      vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
      vfloat svalrte = round(sval);
      vfloat diff = sval - svalrte;
      errval += diff * diff;

      // Accumulate errors for minimum index
      vmask mask = svalrte == minidx;
      vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
      cut_low_weight_err = select(cut_low_weight_err, accum, mask);

      // Accumulate errors for maximum index
      mask = svalrte == maxidx;
      accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
      cut_high_weight_err = select(cut_high_weight_err, accum, mask);
    }

    // Write out min weight and weight span; clamp span to a usable range
    vint span = float_to_int(maxidx - minidx + vfloat(1));
    span = min(span, vint(max_quant_steps + 3));
    span = max(span, vint(2));
    storea(minidx, lowest_weight + sp);
    storea(span, weight_span + sp);

    // The cut_(lowest/highest)_weight_error indicate the error that results from  forcing
    // samples that should have had the weight value one step (up/down).
    vfloat ssize = 1.0f / rcp_stepsize;
    vfloat errscale = ssize * ssize;
    storea(errval * errscale, error + sp);
    storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
    storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);

    rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
  }
}

/**
 * @brief The main function for the angular algorithm.
 *
 * Computes the angular offsets and per-step errors for the weight set, then for every quantization
 * level from 0 up to and including @c max_quant_level selects the angular step with the lowest
 * recorded error for that level's step count, and converts it into a low/high weight value pair.
 * If no candidate was recorded for a level, angular step 0 is used as the fallback.
 *
 * @param      weight_count              The number of (decimated) weights.
 * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
 * @param      max_quant_level           The maximum quantization level to be tested; must not
 *                                       exceed TUNE_MAX_ANGULAR_QUANT.
 * @param[out] low_value                 Per quantization level, the lowest weight value.
 * @param[out] high_value                Per quantization level, the highest weight value.
 */
static void compute_angular_endpoints_for_quant_levels(
  unsigned int weight_count,
  const float* dec_weight_ideal_value,
  unsigned int max_quant_level,
  float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
  float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
  // The output arrays hold TUNE_MAX_ANGULAR_QUANT + 1 entries and entries 0..max_quant_level are
  // written below, so a larger level would write out of bounds
  assert(max_quant_level <= TUNE_MAX_ANGULAR_QUANT);

  unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
  unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];

  ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];

  compute_angular_offsets(weight_count, dec_weight_ideal_value,
                          max_angular_steps, angular_offsets);

  ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
  ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
  ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
  ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
  ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];

  compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
                                    max_angular_steps, max_quant_steps,
                                    angular_offsets, lowest_weight, weight_span, error,
                                    cut_low_weight_error, cut_high_weight_error);

  // For each quantization level, find the best error terms. Use packed vectors so data-dependent
  // branches can become selects. This involves some integer to float casts, but the values are
  // small enough so they never round the wrong way.
  //
  // Spans are clamped to [2, max_quant_steps + 3] and records (span - 2) to (span) are updated, so
  // max_quant_steps + 4 records are needed. ANGULAR_STEPS is statically asserted to be at least
  // the largest entry in steps_for_quant_level, so ANGULAR_STEPS + 4 records are always enough.
  vfloat4 best_results[ANGULAR_STEPS + 4];

  // Initialize the array to some safe defaults
  promise(max_quant_steps > 0);
  for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
  {
    // Lane<0> = Best error
    // Lane<1> = Best scale; -1 indicates no solution found
    // Lane<2> = Cut low weight
    best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
  }

  promise(max_angular_steps > 0);
  for (unsigned int i = 0; i < max_angular_steps; i++)
  {
    float i_flt = static_cast<float>(i);

    int idx_span = weight_span[i];

    float error_cut_low = error[i] + cut_low_weight_error[i];
    float error_cut_high = error[i] + cut_high_weight_error[i];
    float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];

    // Check best error against record N
    vfloat4 best_result = best_results[idx_span];
    vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
    vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
    best_results[idx_span] = select(best_result, new_result, mask);

    // Check best error against record N-1 with either cut low or cut high; cutting the low
    // weight is flagged in lane 2 so the lowest weight index can be raised by one later
    best_result = best_results[idx_span - 1];

    new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
    mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
    best_result = select(best_result, new_result, mask);

    new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
    mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
    best_results[idx_span - 1] = select(best_result, new_result, mask);

    // Check best error against record N-2 with both cut low and high
    best_result = best_results[idx_span - 2];
    new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
    mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
    best_results[idx_span - 2] = select(best_result, new_result, mask);
  }

  for (unsigned int i = 0; i <= max_quant_level; i++)
  {
    unsigned int q = steps_for_quant_level[i];
    int bsi = static_cast<int>(best_results[q].lane<1>());

    // Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
    if ((bsi < 0) && print_once)
    {
      print_once = false;
      printf("INFO: Unable to find full encoding within search error limit.\n\n");
    }
#endif

    // No candidate recorded for this level; fall back to angular step 0
    bsi = astc::max(0, bsi);

    // Lowest index for the chosen step, raised by one if the low weight was cut; the highest
    // index follows from the number of steps at this quantization level
    float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
    float hwi = lwi + static_cast<float>(q) - 1.0f;

    // Angular step bsi uses a reciprocal step size of (bsi + 1); map indices back to weights
    float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
    low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
    high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
  }
}

/* See header for documentation. */
void compute_angular_endpoints_1plane(
  bool only_always,
  const block_size_descriptor& bsd,
  const float* dec_weight_ideal_value,
  unsigned int max_weight_quant,
  compression_working_buffers& tmpbuf
) {
  float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
  float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;

  float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
  float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;

  unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
                                                  : bsd.decimation_mode_count_selected;
  promise(max_decimation_modes > 0);
  for (unsigned int i = 0; i < max_decimation_modes; i++)
  {
    const decimation_mode& dm = bsd.decimation_modes[i];
    if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
    {
      continue;
    }

    unsigned int weight_count = bsd.get_decimation_info(i).weight_count;

    unsigned int max_precision = dm.maxprec_1plane;
    if (max_precision > TUNE_MAX_ANGULAR_QUANT)
    {
      max_precision = TUNE_MAX_ANGULAR_QUANT;
    }

    if (max_precision > max_weight_quant)
    {
      max_precision = max_weight_quant;
    }

    compute_angular_endpoints_for_quant_levels(
        weight_count,
        dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
        max_precision, low_values[i], high_values[i]);
  }

  unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
                                             : bsd.block_mode_count_1plane_selected;
  promise(max_block_modes > 0);
  for (unsigned int i = 0; i < max_block_modes; i++)
  {
    const block_mode& bm = bsd.block_modes[i];
    assert(!bm.is_dual_plane);

    unsigned int quant_mode = bm.quant_mode;
    unsigned int decim_mode = bm.decimation_mode;

    if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
    {
      low_value[i] = low_values[decim_mode][quant_mode];
      high_value[i] = high_values[decim_mode][quant_mode];
    }
    else
    {
      low_value[i] = 0.0f;
      high_value[i] = 1.0f;
    }
  }
}

/* See header for documentation. */
void compute_angular_endpoints_2planes(
  const block_size_descriptor& bsd,
  const float* dec_weight_ideal_value,
  unsigned int max_weight_quant,
  compression_working_buffers& tmpbuf
) {
  float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
  float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
  float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
  float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;

  float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
  float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
  float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
  float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;

  promise(bsd.decimation_mode_count_selected > 0);
  for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
  {
    const decimation_mode& dm = bsd.decimation_modes[i];
    if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
    {
      continue;
    }

    unsigned int weight_count = bsd.get_decimation_info(i).weight_count;

    unsigned int max_precision = dm.maxprec_2planes;
    if (max_precision > TUNE_MAX_ANGULAR_QUANT)
    {
      max_precision = TUNE_MAX_ANGULAR_QUANT;
    }

    if (max_precision > max_weight_quant)
    {
      max_precision = max_weight_quant;
    }

    compute_angular_endpoints_for_quant_levels(
        weight_count,
        dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
        max_precision, low_values1[i], high_values1[i]);

    compute_angular_endpoints_for_quant_levels(
        weight_count,
        dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
        max_precision, low_values2[i], high_values2[i]);
  }

  unsigned int start = bsd.block_mode_count_1plane_selected;
  unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
  for (unsigned int i = start; i < end; i++)
  {
    const block_mode& bm = bsd.block_modes[i];
    unsigned int quant_mode = bm.quant_mode;
    unsigned int decim_mode = bm.decimation_mode;

    if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
    {
      low_value1[i] = low_values1[decim_mode][quant_mode];
      high_value1[i] = high_values1[decim_mode][quant_mode];
      low_value2[i] = low_values2[decim_mode][quant_mode];
      high_value2[i] = high_values2[decim_mode][quant_mode];
    }
    else
    {
      low_value1[i] = 0.0f;
      high_value1[i] = 1.0f;
      low_value2[i] = 0.0f;
      high_value2[i] = 1.0f;
    }
  }
}

#endif

Coverage Report

Created: 2026-05-14 06:55

Line	Count	Source
1		// SPDX-License-Identifier: Apache-2.0
2		// ----------------------------------------------------------------------------
3		// Copyright 2011-2026 Arm Limited
4		//
5		// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6		// use this file except in compliance with the License. You may obtain a copy
7		// of the License at:
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13		// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14		// License for the specific language governing permissions and limitations
15		// under the License.
16		// ----------------------------------------------------------------------------
17
18		#if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20		/**
21		* @brief Functions for angular-sum algorithm for weight alignment.
22		*
23		* This algorithm works as follows:
24		* - we compute a complex number P as (cos si, sin si) for each weight,
25		* where i is the input value and s is a scaling factor based on the spacing between the weights.
26		* - we then add together complex numbers for all the weights.
27		* - we then compute the length and angle of the resulting sum.
28		*
29		* This should produce the following results:
30		* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
31		* - even distribution results in a vector of length 0.
32		* - all samples identical results in perfect alignment for every scaling.
33		*
34		* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
35		* should then result in some scalings standing out as having particularly good alignment factors;
36		* we can use this to produce a set of candidate scale/shift values for various quantization levels;
37		* we should then actually try them and see what happens.
38		*/
39
40		#include "astcenc_internal.h"
41		#include "astcenc_vecmathlib.h"
42
43		#include <stdio.h>
44		#include <cassert>
45		#include <cstring>
46		#include <cfloat>
47
48		static constexpr unsigned int ANGULAR_STEPS { 32 };
49
50		static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
51		"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
52
53		static_assert(ANGULAR_STEPS >= 32,
54		"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
55
56		// Store a reduced sin/cos table for 64 possible weight values; this causes
57		// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
58		static constexpr unsigned int SINCOS_STEPS { 64 };
59
60		static const uint8_t steps_for_quant_level[12] {
61		2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
62		};
63
64		ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
65		ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
66
67		#if defined(ASTCENC_DIAGNOSTICS)
68		static bool print_once { true };
69		#endif
70
71		/* See header for documentation. */
72		void prepare_angular_tables()
73	3.62k	{
74	119k	for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
75	116k	{
76	116k	float angle_step = static_cast<float>(i + 1);
77
78	7.54M	for (unsigned int j = 0; j < SINCOS_STEPS; j++)
79	7.43M	{
80	7.43M	sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
81	7.43M	cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
82	7.43M	}
83	116k	}
84	3.62k	}
85
86		/**
87		* @brief Compute the angular alignment factors and offsets.
88		*
89		* @param weight_count The number of (decimated) weights.
90		* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
91		* @param max_angular_steps The maximum number of steps to be tested.
92		* @param[out] offsets The output angular offsets array.
93		*/
94		static void compute_angular_offsets(
95		unsigned int weight_count,
96		const float* dec_weight_ideal_value,
97		unsigned int max_angular_steps,
98		float* offsets
99	148k	) {
100	148k	promise(weight_count > 0);
101	148k	promise(max_angular_steps > 0);
102
103	148k	ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
104
105		// Precompute isample; arrays are always allocated 64 elements long
106	965k	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
107	816k	{
108		// Ideal weight can be outside [0, 1] range, so clamp to fit table
109	816k	vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
110
111		// Convert a weight to a sincos table index
112	816k	vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
113	816k	vint isample = float_to_int_rtn(sample);
114	816k	storea(isample, isamplev + i);
115	816k	}
116
117		// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
118	148k	vfloat mult(1.0f / (2.0f * astc::PI));
119
120	457k	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
121	308k	{
122	308k	vfloat anglesum_x = vfloat::zero();
123	308k	vfloat anglesum_y = vfloat::zero();
124
125	5.90M	for (unsigned int j = 0; j < weight_count; j++)
126	5.60M	{
127	5.60M	int isample = isamplev[j];
128	5.60M	anglesum_x += loada(cos_table[isample] + i);
129	5.60M	anglesum_y += loada(sin_table[isample] + i);
130	5.60M	}
131
132	308k	vfloat angle = atan2(anglesum_y, anglesum_x);
133
134		// Suppress NaNs generated if anglesums are both zero
135	308k	angle = select(vfloat::zero(), angle, angle == angle);
136
137	308k	vfloat ofs = angle * mult;
138	308k	storea(ofs, offsets + i);
139	308k	}
140	148k	}
141
142		/**
143		* @brief For a given step size compute the lowest and highest weight.
144		*
145		* Compute the lowest and highest weight that results from quantizing using the given stepsize and
146		* offset, and then compute the resulting error. The cut errors indicate the error that results from
147		* forcing samples that should have had one weight value one step up or down.
148		*
149		* @param weight_count The number of (decimated) weights.
150		* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
151		* @param max_angular_steps The maximum number of steps to be tested.
152		* @param max_quant_steps The maximum quantization level to be tested.
153		* @param offsets The angular offsets array.
154		* @param[out] lowest_weight Per angular step, the lowest weight.
155		* @param[out] weight_span Per angular step, the span between lowest and highest weight.
156		* @param[out] error Per angular step, the error.
157		* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
158		* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
159		*/
160		static void compute_lowest_and_highest_weight(
161		unsigned int weight_count,
162		const float* dec_weight_ideal_value,
163		unsigned int max_angular_steps,
164		unsigned int max_quant_steps,
165		const float* offsets,
166		float* lowest_weight,
167		int* weight_span,
168		float* error,
169		float* cut_low_weight_error,
170		float* cut_high_weight_error
171	148k	) {
172	148k	promise(weight_count > 0);
173	148k	promise(max_angular_steps > 0);
174
175	148k	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
176
177		// Compute minimum/maximum weights in the weight array. Our remapping
178		// is monotonic, so the min/max rounded weights relate to the min/max
179		// unrounded weights in a straightforward way.
180	148k	vfloat min_weight(FLT_MAX);
181	148k	vfloat max_weight(-FLT_MAX);
182
183	148k	vint lane_id = vint::lane_id();
184	965k	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
185	816k	{
186	816k	vmask active = lane_id < vint(weight_count);
187	816k	lane_id += vint(ASTCENC_SIMD_WIDTH);
188
189	816k	vfloat weights = loada(dec_weight_ideal_value + i);
190	816k	min_weight = min(select(min_weight, weights, active), min_weight);
191	816k	max_weight = max(select(max_weight, weights, active), max_weight);
192	816k	}
193
194	148k	min_weight = hmin(min_weight);
195	148k	max_weight = hmax(max_weight);
196
197		// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
198	457k	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
199	308k	{
200	308k	vfloat errval = vfloat::zero();
201	308k	vfloat cut_low_weight_err = vfloat::zero();
202	308k	vfloat cut_high_weight_err = vfloat::zero();
203	308k	vfloat offset = loada(offsets + sp);
204
205		// We know the min and max weight values, so we can figure out
206		// the corresponding indices before we enter the loop.
207	308k	vfloat minidx = round(min_weight * rcp_stepsize - offset);
208	308k	vfloat maxidx = round(max_weight * rcp_stepsize - offset);
209
210	5.90M	for (unsigned int j = 0; j < weight_count; j++)
211	5.60M	{
212	5.60M	vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
213	5.60M	vfloat svalrte = round(sval);
214	5.60M	vfloat diff = sval - svalrte;
215	5.60M	errval += diff * diff;
216
217		// Accumulate errors for minimum index
218	5.60M	vmask mask = svalrte == minidx;
219	5.60M	vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
220	5.60M	cut_low_weight_err = select(cut_low_weight_err, accum, mask);
221
222		// Accumulate errors for maximum index
223	5.60M	mask = svalrte == maxidx;
224	5.60M	accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
225	5.60M	cut_high_weight_err = select(cut_high_weight_err, accum, mask);
226	5.60M	}
227
228		// Write out min weight and weight span; clamp span to a usable range
229	308k	vint span = float_to_int(maxidx - minidx + vfloat(1));
230	308k	span = min(span, vint(max_quant_steps + 3));
231	308k	span = max(span, vint(2));
232	308k	storea(minidx, lowest_weight + sp);
233	308k	storea(span, weight_span + sp);
234
235		// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
236		// samples that should have had the weight value one step (up/down).
237	308k	vfloat ssize = 1.0f / rcp_stepsize;
238	308k	vfloat errscale = ssize * ssize;
239	308k	storea(errval * errscale, error + sp);
240	308k	storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
241	308k	storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
242
243	308k	rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
244	308k	}
245	148k	}
246
247		/**
248		* @brief The main function for the angular algorithm.
249		*
250		* @param weight_count The number of (decimated) weights.
251		* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
252		* @param max_quant_level The maximum quantization level to be tested.
253		* @param[out] low_value Per angular step, the lowest weight value.
254		* @param[out] high_value Per angular step, the highest weight value.
255		*/
256		static void compute_angular_endpoints_for_quant_levels(
257		unsigned int weight_count,
258		const float* dec_weight_ideal_value,
259		unsigned int max_quant_level,
260		float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
261		float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
262	148k	) {
263	148k	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
264	148k	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
265
266	148k	ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
267
268	148k	compute_angular_offsets(weight_count, dec_weight_ideal_value,
269	148k	max_angular_steps, angular_offsets);
270
271	148k	ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
272	148k	ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
273	148k	ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
274	148k	ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
275	148k	ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
276
277	148k	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
278	148k	max_angular_steps, max_quant_steps,
279	148k	angular_offsets, lowest_weight, weight_span, error,
280	148k	cut_low_weight_error, cut_high_weight_error);
281
282		// For each quantization level, find the best error terms. Use packed vectors so data-dependent
283		// branches can become selects. This involves some integer to float casts, but the values are
284		// small enough so they never round the wrong way.
285	148k	vfloat4 best_results[36];
286
287		// Initialize the array to some safe defaults
288	148k	promise(max_quant_steps > 0);
289	1.86M	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
290	1.71M	{
291		// Lane<0> = Best error
292		// Lane<1> = Best scale; -1 indicates no solution found
293		// Lane<2> = Cut low weight
294	1.71M	best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
295	1.71M	}
296
297	148k	promise(max_angular_steps > 0);
298	1.27M	for (unsigned int i = 0; i < max_angular_steps; i++)
299	1.12M	{
300	1.12M	float i_flt = static_cast<float>(i);
301
302	1.12M	int idx_span = weight_span[i];
303
304	1.12M	float error_cut_low = error[i] + cut_low_weight_error[i];
305	1.12M	float error_cut_high = error[i] + cut_high_weight_error[i];
306	1.12M	float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
307
308		// Check best error against record N
309	1.12M	vfloat4 best_result = best_results[idx_span];
310	1.12M	vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
311	1.12M	vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
312	1.12M	best_results[idx_span] = select(best_result, new_result, mask);
313
314		// Check best error against record N-1 with either cut low or cut high
315	1.12M	best_result = best_results[idx_span - 1];
316
317	1.12M	new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
318	1.12M	mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
319	1.12M	best_result = select(best_result, new_result, mask);
320
321	1.12M	new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
322	1.12M	mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
323	1.12M	best_results[idx_span - 1] = select(best_result, new_result, mask);
324
325		// Check best error against record N-2 with both cut low and high
326	1.12M	best_result = best_results[idx_span - 2];
327	1.12M	new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
328	1.12M	mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
329	1.12M	best_results[idx_span - 2] = select(best_result, new_result, mask);
330	1.12M	}
331
332	935k	for (unsigned int i = 0; i <= max_quant_level; i++)
333	786k	{
334	786k	unsigned int q = steps_for_quant_level[i];
335	786k	int bsi = static_cast<int>(best_results[q].lane<1>());
336
337		// Did we find anything?
338		#if defined(ASTCENC_DIAGNOSTICS)
339		if ((bsi < 0) && print_once)
340		{
341		print_once = false;
342		printf("INFO: Unable to find full encoding within search error limit.\n\n");
343		}
344		#endif
345
346	786k	bsi = astc::max(0, bsi);
347
348	786k	float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
349	786k	float hwi = lwi + static_cast<float>(q) - 1.0f;
350
351	786k	float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
352	786k	low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
353	786k	high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
354	786k	}
355	148k	}
356
357		/* See header for documentation. */
358		void compute_angular_endpoints_1plane(
359		bool only_always,
360		const block_size_descriptor& bsd,
361		const float* dec_weight_ideal_value,
362		unsigned int max_weight_quant,
363		compression_working_buffers& tmpbuf
364	11.1k	) {
365	11.1k	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
366	11.1k	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
367
368	11.1k	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
369	11.1k	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
370
371	11.1k	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
372	11.1k	: bsd.decimation_mode_count_selected;
373	11.1k	promise(max_decimation_modes > 0);
374	120k	for (unsigned int i = 0; i < max_decimation_modes; i++)
375	109k	{
376	109k	const decimation_mode& dm = bsd.decimation_modes[i];
377	109k	if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
378	35.9k	{
379	35.9k	continue;
380	35.9k	}
381
382	73.0k	unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
383
384	73.0k	unsigned int max_precision = dm.maxprec_1plane;
385	73.0k	if (max_precision > TUNE_MAX_ANGULAR_QUANT)
386	42.0k	{
387	42.0k	max_precision = TUNE_MAX_ANGULAR_QUANT;
388	42.0k	}
389
390	73.0k	if (max_precision > max_weight_quant)
391	26.3k	{
392	26.3k	max_precision = max_weight_quant;
393	26.3k	}
394
395	73.0k	compute_angular_endpoints_for_quant_levels(
396	73.0k	weight_count,
397	73.0k	dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
398	73.0k	max_precision, low_values[i], high_values[i]);
399	73.0k	}
400
401	11.1k	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
402	11.1k	: bsd.block_mode_count_1plane_selected;
403	11.1k	promise(max_block_modes > 0);
404	286k	for (unsigned int i = 0; i < max_block_modes; i++)
405	275k	{
406	275k	const block_mode& bm = bsd.block_modes[i];
407	275k	assert(!bm.is_dual_plane);
408
409	275k	unsigned int quant_mode = bm.quant_mode;
410	275k	unsigned int decim_mode = bm.decimation_mode;
411
412	275k	if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
413	222k	{
414	222k	low_value[i] = low_values[decim_mode][quant_mode];
415	222k	high_value[i] = high_values[decim_mode][quant_mode];
416	222k	}
417	53.3k	else
418	53.3k	{
419	53.3k	low_value[i] = 0.0f;
420	53.3k	high_value[i] = 1.0f;
421	53.3k	}
422	275k	}
423	11.1k	}
424
425		/* See header for documentation. */
426		void compute_angular_endpoints_2planes(
427		const block_size_descriptor& bsd,
428		const float* dec_weight_ideal_value,
429		unsigned int max_weight_quant,
430		compression_working_buffers& tmpbuf
431	7.06k	) {
432	7.06k	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
433	7.06k	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
434	7.06k	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
435	7.06k	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
436
437	7.06k	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
438	7.06k	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
439	7.06k	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
440	7.06k	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
441
442	7.06k	promise(bsd.decimation_mode_count_selected > 0);
443	72.0k	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
444	65.0k	{
445	65.0k	const decimation_mode& dm = bsd.decimation_modes[i];
446	65.0k	if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
447	27.1k	{
448	27.1k	continue;
449	27.1k	}
450
451	37.8k	unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
452
453	37.8k	unsigned int max_precision = dm.maxprec_2planes;
454	37.8k	if (max_precision > TUNE_MAX_ANGULAR_QUANT)
455	15.6k	{
456	15.6k	max_precision = TUNE_MAX_ANGULAR_QUANT;
457	15.6k	}
458
459	37.8k	if (max_precision > max_weight_quant)
460	10.1k	{
461	10.1k	max_precision = max_weight_quant;
462	10.1k	}
463
464	37.8k	compute_angular_endpoints_for_quant_levels(
465	37.8k	weight_count,
466	37.8k	dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
467	37.8k	max_precision, low_values1[i], high_values1[i]);
468
469	37.8k	compute_angular_endpoints_for_quant_levels(
470	37.8k	weight_count,
471	37.8k	dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
472	37.8k	max_precision, low_values2[i], high_values2[i]);
473	37.8k	}
474
475	7.06k	unsigned int start = bsd.block_mode_count_1plane_selected;
476	7.06k	unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
477	77.2k	for (unsigned int i = start; i < end; i++)
478	70.2k	{
479	70.2k	const block_mode& bm = bsd.block_modes[i];
480	70.2k	unsigned int quant_mode = bm.quant_mode;
481	70.2k	unsigned int decim_mode = bm.decimation_mode;
482
483	70.2k	if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
484	58.8k	{
485	58.8k	low_value1[i] = low_values1[decim_mode][quant_mode];
486	58.8k	high_value1[i] = high_values1[decim_mode][quant_mode];
487	58.8k	low_value2[i] = low_values2[decim_mode][quant_mode];
488	58.8k	high_value2[i] = high_values2[decim_mode][quant_mode];
489	58.8k	}
490	11.3k	else
491	11.3k	{
492	11.3k	low_value1[i] = 0.0f;
493	11.3k	high_value1[i] = 1.0f;
494	11.3k	low_value2[i] = 0.0f;
495	11.3k	high_value2[i] = 1.0f;
496	11.3k	}
497	70.2k	}
498	7.06k	}
499
500		#endif