/src/astc-encoder/Source/astcenc_mathlib_softfloat.cpp

Source
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021,2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief Soft-float library for IEEE-754.
 */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)

#include "astcenc_mathlib.h"

/*  sized soft-float types. These are mapped to the sized integer
    types of C99, instead of C's floating-point types; this is because
    the library needs to maintain exact, bit-level control on all
    operations on these data types. */
typedef uint16_t sf16;
typedef uint32_t sf32;

/******************************************
  helper functions
 ******************************************/

/* Idiomatic count-leading zeros, generates native instruction on modern compilers. */
static uint32_t clz32(uint32_t inp)
{
    uint32_t count = 32;
    while (inp)
    {
        inp >>= 1;
        count--;
    }
    return count;
}

/* the five rounding modes that IEEE-754r defines */
typedef enum
{
  SF_UP = 0,        /* round towards positive infinity */
  SF_DOWN = 1,      /* round towards negative infinity */
  SF_TOZERO = 2,      /* round towards zero */
  SF_NEARESTEVEN = 3,   /* round toward nearest value; if mid-between, round to even value */
  SF_NEARESTAWAY = 4    /* round toward nearest value; if mid-between, round away from zero */
} roundmode;


static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
  uint32_t vl1 = UINT32_C(1) << shamt;
  uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */
  uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
  msk--;            /* negative if even, nonnegative if odd. */
  inp2 -= (msk >> 31);    /* subtract epsilon before shift if even. */
  inp2 >>= shamt;
  return inp2;
}

static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
  uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
  inp += vl1;
  inp >>= shamt;
  return inp;
}

static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
  uint32_t vl1 = UINT32_C(1) << shamt;
  inp += vl1;
  inp--;
  inp >>= shamt;
  return inp;
}

/* convert from FP16 to FP32. */
static sf32 sf16_to_sf32(sf16 inp)
{
  uint32_t inpx = inp;

  /*
    This table contains, for every FP16 sign/exponent value combination,
    the difference between the input FP16 value and the value obtained
    by shifting the correct FP32 result right by 13 bits.
    This table allows us to handle every case except denormals and NaN
    with just 1 table lookup, 2 shifts and 1 add.
  */

  #define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
  static const uint32_t tbl[64] =
  {
    WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
             0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
             0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
             0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
    WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
             0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
             0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
             0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
  };

  uint32_t res = tbl[inpx >> 10];
  res += inpx;

  /* Normal cases: MSB of 'res' not set. */
  if ((res & WITH_MSB(0)) == 0)
  {
    return res << 13;
  }

  /* Infinity and Zero: 10 LSB of 'res' not set. */
  if ((res & 0x3FF) == 0)
  {
    return res << 13;
  }

  /* NaN: the exponent field of 'inp' is non-zero. */
  if ((inpx & 0x7C00) != 0)
  {
    /* All NaNs are quietened. */
    return (res << 13) | 0x400000;
  }

  /* Denormal cases */
  uint32_t sign = (inpx & 0x8000) << 16;
  uint32_t mskval = inpx & 0x7FFF;
  uint32_t leadingzeroes = clz32(mskval);
  mskval <<= leadingzeroes;
  return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
}

/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
{
  /* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
  static const uint8_t tab[512] {
    0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
    30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,

    5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
    35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
  };

  /* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
     size. */
  static const uint32_t tabx[60] {
    UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
    UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
    UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
    UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
    UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
    UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
    UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
    UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
    UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
  };

  uint32_t p;
  uint32_t idx = rmode + tab[inp >> 23];
  uint32_t vlx = tabx[idx];
  switch (idx)
  {
    /*
      Positive number which may be Infinity or NaN.
      We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
      (If we don't do this quieting, then a NaN  that is distinguished only by having
      its low-order bits set, would be turned into an INF. */
  case 50:
  case 51:
  case 52:
  case 53:
  case 54:
  case 55:
  case 56:
  case 57:
  case 58:
  case 59:
    /*
      the input value is 0x7F800000 or 0xFF800000 if it is INF.
      By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
      For NaNs, however, this operation will keep bit 23 with the value 1.
      We can then extract bit 23, and logical-OR bit 9 of the result with this
      bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
      of the mantissa is set.)
    */
    p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
    return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
    /*
      positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
      If it is, then return 0, else return 1 (the smallest representable nonzero number)
    */
  case 0:
    /*
      -inp will set the MSB if the input number is nonzero.
      Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
    */
    return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);

    /*
      negative, exponent = , round-mode == DOWN, need to check whether number is
      actually 0. If it is, return 0x8000 ( float -0.0 )
      Else return the smallest negative number ( 0x8001 ) */
  case 6:
    /*
      in this case 'vlx' is 0x80000000. By subtracting the input value from it,
      we obtain a value that is 0 if the input value is in fact zero and has
      the MSB set if it isn't. We then right-shift the value by 31 places to
      get a value that is 0 if the input is -0.0 and 1 otherwise.
    */
    return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));

    /*
      for all other cases involving underflow/overflow, we don't need to
      do actual tests; we just return 'vlx'.
    */
  case 1:
  case 2:
  case 3:
  case 4:
  case 5:
  case 7:
  case 8:
  case 9:
  case 10:
  case 11:
  case 12:
  case 13:
  case 14:
  case 15:
  case 16:
  case 17:
  case 18:
  case 19:
  case 40:
  case 41:
  case 42:
  case 43:
  case 44:
  case 45:
  case 46:
  case 47:
  case 48:
  case 49:
    return static_cast<sf16>(vlx);

    /*
      for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
      FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
      baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
      from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
      for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
      except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */

    /* normal number, all rounding modes except round-to-nearest-even: */
  case 30:
  case 31:
  case 32:
  case 34:
  case 35:
  case 36:
  case 37:
  case 39:
    return static_cast<sf16>((inp + vlx) >> 13);

    /* normal number, round-to-nearest-even. */
  case 33:
  case 38:
    p = inp + vlx;
    p += (inp >> 13) & 1;
    return static_cast<sf16>(p >> 13);

    /*
      the various denormal cases. These are not expected to be common, so their performance is a bit
      less important. For each of these cases, we need to extract an exponent and a mantissa
      (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
      depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
      sign of the resulting denormal number.
    */
  case 21:
  case 22:
  case 25:
  case 27:
    /* denormal, round towards zero. */
    p = 126 - ((inp >> 23) & 0xFF);
    return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
  case 20:
  case 26:
    /* denormal, round away from zero. */
    p = 126 - ((inp >> 23) & 0xFF);
    return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
  case 24:
  case 29:
    /* denormal, round to nearest-away */
    p = 126 - ((inp >> 23) & 0xFF);
    return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
  case 23:
  case 28:
    /* denormal, round to nearest-even. */
    p = 126 - ((inp >> 23) & 0xFF);
    return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
  }

  return 0;
}

/* convert from soft-float to native-float */
float sf16_to_float(uint16_t p)
{
  return astc::uint_as_float(sf16_to_sf32(p));
}

/* convert from native-float to soft-float */
uint16_t float_to_sf16(float p)
{
  unsigned int ip = astc::float_as_uint(p);
  return sf32_to_sf16(ip, SF_NEARESTEVEN);
}

#endif

Coverage Report

Created: 2026-06-04 06:52

Line	Count	Source
1		// SPDX-License-Identifier: Apache-2.0
2		// ----------------------------------------------------------------------------
3		// Copyright 2011-2021,2025 Arm Limited
4		//
5		// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6		// use this file except in compliance with the License. You may obtain a copy
7		// of the License at:
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13		// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14		// License for the specific language governing permissions and limitations
15		// under the License.
16		// ----------------------------------------------------------------------------
17
18		/**
19		* @brief Soft-float library for IEEE-754.
20		*/
21		#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
22
23		#include "astcenc_mathlib.h"
24
25		/* sized soft-float types. These are mapped to the sized integer
26		types of C99, instead of C's floating-point types; this is because
27		the library needs to maintain exact, bit-level control on all
28		operations on these data types. */
29		typedef uint16_t sf16;
30		typedef uint32_t sf32;
31
32		/******************************************
33		helper functions
34		******************************************/
35
36		/* Idiomatic count-leading zeros, generates native instruction on modern compilers. */
37		static uint32_t clz32(uint32_t inp)
38	4.25k	{
39	4.25k	uint32_t count = 32;
40	42.0k	while (inp)
41	37.7k	{
42	37.7k	inp >>= 1;
43	37.7k	count--;
44	37.7k	}
45	4.25k	return count;
46	4.25k	}
47
48		/* the five rounding modes that IEEE-754r defines */
49		typedef enum
50		{
51		SF_UP = 0, /* round towards positive infinity */
52		SF_DOWN = 1, /* round towards negative infinity */
53		SF_TOZERO = 2, /* round towards zero */
54		SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */
55		SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */
56		} roundmode;
57
58
59		static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
60	0	{
61	0	uint32_t vl1 = UINT32_C(1) << shamt;
62	0	uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */
63	0	uint32_t msk = (inp \| UINT32_C(1)) & vl1; /* nonzero if odd. '\| 1' forces it to 1 if the shamt is 0. */
64	0	msk--; /* negative if even, nonnegative if odd. */
65	0	inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */
66	0	inp2 >>= shamt;
67	0	return inp2;
68	0	}
69
70		static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
71	0	{
72	0	uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
73	0	inp += vl1;
74	0	inp >>= shamt;
75	0	return inp;
76	0	}
77
78		static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
79	0	{
80	0	uint32_t vl1 = UINT32_C(1) << shamt;
81	0	inp += vl1;
82	0	inp--;
83	0	inp >>= shamt;
84	0	return inp;
85	0	}
86
87		/* convert from FP16 to FP32. */
88		static sf32 sf16_to_sf32(sf16 inp)
89	817k	{
90	817k	uint32_t inpx = inp;
91
92		/*
93		This table contains, for every FP16 sign/exponent value combination,
94		the difference between the input FP16 value and the value obtained
95		by shifting the correct FP32 result right by 13 bits.
96		This table allows us to handle every case except denormals and NaN
97		with just 1 table lookup, 2 shifts and 1 add.
98		*/
99
100	4.08M	#define WITH_MSB(a) (UINT32_C(a) \| (1u << 31))
101	817k	static const uint32_t tbl[64] =
102	817k	{
103	817k	WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
104	817k	0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
105	817k	0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
106	817k	0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
107	817k	WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
108	817k	0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
109	817k	0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
110	817k	0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
111	817k	};
112
113	817k	uint32_t res = tbl[inpx >> 10];
114	817k	res += inpx;
115
116		/* Normal cases: MSB of 'res' not set. */
117	817k	if ((res & WITH_MSB(0)) == 0)
118	694k	{
119	694k	return res << 13;
120	694k	}
121
122		/* Infinity and Zero: 10 LSB of 'res' not set. */
123	122k	if ((res & 0x3FF) == 0)
124	118k	{
125	118k	return res << 13;
126	118k	}
127
128		/* NaN: the exponent field of 'inp' is non-zero. */
129	4.39k	if ((inpx & 0x7C00) != 0)
130	133	{
131		/* All NaNs are quietened. */
132	133	return (res << 13) \| 0x400000;
133	133	}
134
135		/* Denormal cases */
136	4.25k	uint32_t sign = (inpx & 0x8000) << 16;
137	4.25k	uint32_t mskval = inpx & 0x7FFF;
138	4.25k	uint32_t leadingzeroes = clz32(mskval);
139	4.25k	mskval <<= leadingzeroes;
140	4.25k	return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
141	4.39k	}
142
143		/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
144		static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
145	24	{
146		/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
147	24	static const uint8_t tab[512] {
148	24	0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
149	24	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
150	24	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
151	24	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
152	24	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
153	24	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
154	24	10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
155	24	20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
156	24	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
157	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
158	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
159	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
160	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
161	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
162	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
163	24	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
164
165	24	5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
166	24	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
167	24	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
168	24	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
169	24	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
170	24	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
171	24	15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
172	24	25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
173	24	35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
174	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
175	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
176	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
177	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
178	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
179	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
180	24	45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
181	24	};
182
183		/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
184		size. */
185	24	static const uint32_t tabx[60] {
186	24	UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
187	24	UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
188	24	UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
189	24	UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
190	24	UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
191	24	UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
192	24	UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
193	24	UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
194	24	UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
195	24	};
196
197	24	uint32_t p;
198	24	uint32_t idx = rmode + tab[inp >> 23];
199	24	uint32_t vlx = tabx[idx];
200	24	switch (idx)
201	24	{
202		/*
203		Positive number which may be Infinity or NaN.
204		We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
205		(If we don't do this quieting, then a NaN that is distinguished only by having
206		its low-order bits set, would be turned into an INF. */
207	0	case 50:
208	0	case 51:
209	0	case 52:
210	0	case 53:
211	0	case 54:
212	0	case 55:
213	0	case 56:
214	0	case 57:
215	0	case 58:
216	0	case 59:
217		/*
218		the input value is 0x7F800000 or 0xFF800000 if it is INF.
219		By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
220		For NaNs, however, this operation will keep bit 23 with the value 1.
221		We can then extract bit 23, and logical-OR bit 9 of the result with this
222		bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
223		of the mantissa is set.)
224		*/
225	0	p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
226	0	return static_cast<sf16>(((inp + vlx) >> 13) \| (p >> 14));
227		/*
228		positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
229		If it is, then return 0, else return 1 (the smallest representable nonzero number)
230		*/
231	0	case 0:
232		/*
233		-inp will set the MSB if the input number is nonzero.
234		Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
235		*/
236	0	return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
237
238		/*
239		negative, exponent = , round-mode == DOWN, need to check whether number is
240		actually 0. If it is, return 0x8000 ( float -0.0 )
241		Else return the smallest negative number ( 0x8001 ) */
242	0	case 6:
243		/*
244		in this case 'vlx' is 0x80000000. By subtracting the input value from it,
245		we obtain a value that is 0 if the input value is in fact zero and has
246		the MSB set if it isn't. We then right-shift the value by 31 places to
247		get a value that is 0 if the input is -0.0 and 1 otherwise.
248		*/
249	0	return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
250
251		/*
252		for all other cases involving underflow/overflow, we don't need to
253		do actual tests; we just return 'vlx'.
254		*/
255	0	case 1:
256	0	case 2:
257	14	case 3:
258	14	case 4:
259	14	case 5:
260	14	case 7:
261	14	case 8:
262	14	case 9:
263	14	case 10:
264	14	case 11:
265	14	case 12:
266	14	case 13:
267	14	case 14:
268	14	case 15:
269	14	case 16:
270	14	case 17:
271	14	case 18:
272	14	case 19:
273	14	case 40:
274	14	case 41:
275	14	case 42:
276	14	case 43:
277	14	case 44:
278	14	case 45:
279	14	case 46:
280	14	case 47:
281	14	case 48:
282	14	case 49:
283	14	return static_cast<sf16>(vlx);
284
285		/*
286		for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
287		FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
288		baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
289		from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
290		for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
291		except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
292
293		/* normal number, all rounding modes except round-to-nearest-even: */
294	0	case 30:
295	0	case 31:
296	0	case 32:
297	0	case 34:
298	0	case 35:
299	0	case 36:
300	0	case 37:
301	0	case 39:
302	0	return static_cast<sf16>((inp + vlx) >> 13);
303
304		/* normal number, round-to-nearest-even. */
305	10	case 33:
306	10	case 38:
307	10	p = inp + vlx;
308	10	p += (inp >> 13) & 1;
309	10	return static_cast<sf16>(p >> 13);
310
311		/*
312		the various denormal cases. These are not expected to be common, so their performance is a bit
313		less important. For each of these cases, we need to extract an exponent and a mantissa
314		(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
315		depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
316		sign of the resulting denormal number.
317		*/
318	0	case 21:
319	0	case 22:
320	0	case 25:
321	0	case 27:
322		/* denormal, round towards zero. */
323	0	p = 126 - ((inp >> 23) & 0xFF);
324	0	return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) \| vlx);
325	0	case 20:
326	0	case 26:
327		/* denormal, round away from zero. */
328	0	p = 126 - ((inp >> 23) & 0xFF);
329	0	return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) \| vlx);
330	0	case 24:
331	0	case 29:
332		/* denormal, round to nearest-away */
333	0	p = 126 - ((inp >> 23) & 0xFF);
334	0	return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) \| vlx);
335	0	case 23:
336	0	case 28:
337		/* denormal, round to nearest-even. */
338	0	p = 126 - ((inp >> 23) & 0xFF);
339	0	return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) \| vlx);
340	24	}
341
342	0	return 0;
343	24	}
344
345		/* convert from soft-float to native-float */
346		float sf16_to_float(uint16_t p)
347	817k	{
348	817k	return astc::uint_as_float(sf16_to_sf32(p));
349	817k	}
350
351		/* convert from native-float to soft-float */
352		uint16_t float_to_sf16(float p)
353	24	{
354	24	unsigned int ip = astc::float_as_uint(p);
355	24	return sf32_to_sf16(ip, SF_NEARESTEVEN);
356	24	}
357
358		#endif