/src/astc-encoder/Source/astcenc_mathlib_softfloat.cpp
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // ---------------------------------------------------------------------------- |
3 | | // Copyright 2011-2021,2025 Arm Limited |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | | // use this file except in compliance with the License. You may obtain a copy |
7 | | // of the License at: |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | | // License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // ---------------------------------------------------------------------------- |
17 | | |
18 | | /** |
19 | | * @brief Soft-float library for IEEE-754. |
20 | | */ |
21 | | #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) |
22 | | |
23 | | #include "astcenc_mathlib.h" |
24 | | |
25 | | /* sized soft-float types. These are mapped to the sized integer |
26 | | types of C99, instead of C's floating-point types; this is because |
27 | | the library needs to maintain exact, bit-level control on all |
28 | | operations on these data types. */ |
29 | | typedef uint16_t sf16; |
30 | | typedef uint32_t sf32; |
31 | | |
32 | | /****************************************** |
33 | | helper functions |
34 | | ******************************************/ |
35 | | |
36 | | /* Idiomatic count-leading zeros, generates native instruction on modern compilers. */ |
37 | | static uint32_t clz32(uint32_t inp) |
38 | 4.25k | { |
39 | 4.25k | uint32_t count = 32; |
40 | 42.0k | while (inp) |
41 | 37.7k | { |
42 | 37.7k | inp >>= 1; |
43 | 37.7k | count--; |
44 | 37.7k | } |
45 | 4.25k | return count; |
46 | 4.25k | } |
47 | | |
48 | | /* the five rounding modes that IEEE-754r defines */ |
49 | | typedef enum |
50 | | { |
51 | | SF_UP = 0, /* round towards positive infinity */ |
52 | | SF_DOWN = 1, /* round towards negative infinity */ |
53 | | SF_TOZERO = 2, /* round towards zero */ |
54 | | SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */ |
55 | | SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */ |
56 | | } roundmode; |
57 | | |
58 | | |
59 | | static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt) |
60 | 0 | { |
61 | 0 | uint32_t vl1 = UINT32_C(1) << shamt; |
62 | 0 | uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */ |
63 | 0 | uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */ |
64 | 0 | msk--; /* negative if even, nonnegative if odd. */ |
65 | 0 | inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */ |
66 | 0 | inp2 >>= shamt; |
67 | 0 | return inp2; |
68 | 0 | } |
69 | | |
70 | | static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt) |
71 | 0 | { |
72 | 0 | uint32_t vl1 = (UINT32_C(1) << shamt) >> 1; |
73 | 0 | inp += vl1; |
74 | 0 | inp >>= shamt; |
75 | 0 | return inp; |
76 | 0 | } |
77 | | |
78 | | static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt) |
79 | 0 | { |
80 | 0 | uint32_t vl1 = UINT32_C(1) << shamt; |
81 | 0 | inp += vl1; |
82 | 0 | inp--; |
83 | 0 | inp >>= shamt; |
84 | 0 | return inp; |
85 | 0 | } |
86 | | |
87 | | /* convert from FP16 to FP32. */ |
88 | | static sf32 sf16_to_sf32(sf16 inp) |
89 | 817k | { |
90 | 817k | uint32_t inpx = inp; |
91 | | |
92 | | /* |
93 | | This table contains, for every FP16 sign/exponent value combination, |
94 | | the difference between the input FP16 value and the value obtained |
95 | | by shifting the correct FP32 result right by 13 bits. |
96 | | This table allows us to handle every case except denormals and NaN |
97 | | with just 1 table lookup, 2 shifts and 1 add. |
98 | | */ |
99 | | |
100 | 4.08M | #define WITH_MSB(a) (UINT32_C(a) | (1u << 31)) |
101 | 817k | static const uint32_t tbl[64] = |
102 | 817k | { |
103 | 817k | WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
104 | 817k | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
105 | 817k | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
106 | 817k | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000), |
107 | 817k | WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
108 | 817k | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
109 | 817k | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
110 | 817k | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000) |
111 | 817k | }; |
112 | | |
113 | 817k | uint32_t res = tbl[inpx >> 10]; |
114 | 817k | res += inpx; |
115 | | |
116 | | /* Normal cases: MSB of 'res' not set. */ |
117 | 817k | if ((res & WITH_MSB(0)) == 0) |
118 | 694k | { |
119 | 694k | return res << 13; |
120 | 694k | } |
121 | | |
122 | | /* Infinity and Zero: 10 LSB of 'res' not set. */ |
123 | 122k | if ((res & 0x3FF) == 0) |
124 | 118k | { |
125 | 118k | return res << 13; |
126 | 118k | } |
127 | | |
128 | | /* NaN: the exponent field of 'inp' is non-zero. */ |
129 | 4.39k | if ((inpx & 0x7C00) != 0) |
130 | 133 | { |
131 | | /* All NaNs are quietened. */ |
132 | 133 | return (res << 13) | 0x400000; |
133 | 133 | } |
134 | | |
135 | | /* Denormal cases */ |
136 | 4.25k | uint32_t sign = (inpx & 0x8000) << 16; |
137 | 4.25k | uint32_t mskval = inpx & 0x7FFF; |
138 | 4.25k | uint32_t leadingzeroes = clz32(mskval); |
139 | 4.25k | mskval <<= leadingzeroes; |
140 | 4.25k | return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign; |
141 | 4.39k | } |
142 | | |
143 | | /* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */ |
144 | | static sf16 sf32_to_sf16(sf32 inp, roundmode rmode) |
145 | 24 | { |
146 | | /* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */ |
147 | 24 | static const uint8_t tab[512] { |
148 | 24 | 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
149 | 24 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
150 | 24 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
151 | 24 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
152 | 24 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
153 | 24 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
154 | 24 | 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, |
155 | 24 | 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, |
156 | 24 | 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40, |
157 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
158 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
159 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
160 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
161 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
162 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
163 | 24 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50, |
164 | | |
165 | 24 | 5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
166 | 24 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
167 | 24 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
168 | 24 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
169 | 24 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
170 | 24 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
171 | 24 | 15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
172 | 24 | 25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, |
173 | 24 | 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45, |
174 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
175 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
176 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
177 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
178 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
179 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
180 | 24 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55, |
181 | 24 | }; |
182 | | |
183 | | /* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code |
184 | | size. */ |
185 | 24 | static const uint32_t tabx[60] { |
186 | 24 | UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
187 | 24 | UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
188 | 24 | UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
189 | 24 | UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000), |
190 | 24 | UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000), |
191 | 24 | UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00), |
192 | 24 | UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00), |
193 | 24 | UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), |
194 | 24 | UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000) |
195 | 24 | }; |
196 | | |
197 | 24 | uint32_t p; |
198 | 24 | uint32_t idx = rmode + tab[inp >> 23]; |
199 | 24 | uint32_t vlx = tabx[idx]; |
200 | 24 | switch (idx) |
201 | 24 | { |
202 | | /* |
203 | | Positive number which may be Infinity or NaN. |
204 | | We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa. |
205 | | (If we don't do this quieting, then a NaN that is distinguished only by having |
206 | | its low-order bits set, would be turned into an INF. */ |
207 | 0 | case 50: |
208 | 0 | case 51: |
209 | 0 | case 52: |
210 | 0 | case 53: |
211 | 0 | case 54: |
212 | 0 | case 55: |
213 | 0 | case 56: |
214 | 0 | case 57: |
215 | 0 | case 58: |
216 | 0 | case 59: |
217 | | /* |
218 | | the input value is 0x7F800000 or 0xFF800000 if it is INF. |
219 | | By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero. |
220 | | For NaNs, however, this operation will keep bit 23 with the value 1. |
221 | | We can then extract bit 23, and logical-OR bit 9 of the result with this |
222 | | bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit |
223 | | of the mantissa is set.) |
224 | | */ |
225 | 0 | p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */ |
226 | 0 | return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14)); |
227 | | /* |
228 | | positive, exponent = 0, round-mode == UP; need to check whether number actually is 0. |
229 | | If it is, then return 0, else return 1 (the smallest representable nonzero number) |
230 | | */ |
231 | 0 | case 0: |
232 | | /* |
233 | | -inp will set the MSB if the input number is nonzero. |
234 | | Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise. |
235 | | */ |
236 | 0 | return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31); |
237 | | |
238 | | /* |
239 | | negative, exponent = , round-mode == DOWN, need to check whether number is |
240 | | actually 0. If it is, return 0x8000 ( float -0.0 ) |
241 | | Else return the smallest negative number ( 0x8001 ) */ |
242 | 0 | case 6: |
243 | | /* |
244 | | in this case 'vlx' is 0x80000000. By subtracting the input value from it, |
245 | | we obtain a value that is 0 if the input value is in fact zero and has |
246 | | the MSB set if it isn't. We then right-shift the value by 31 places to |
247 | | get a value that is 0 if the input is -0.0 and 1 otherwise. |
248 | | */ |
249 | 0 | return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000)); |
250 | | |
251 | | /* |
252 | | for all other cases involving underflow/overflow, we don't need to |
253 | | do actual tests; we just return 'vlx'. |
254 | | */ |
255 | 0 | case 1: |
256 | 0 | case 2: |
257 | 14 | case 3: |
258 | 14 | case 4: |
259 | 14 | case 5: |
260 | 14 | case 7: |
261 | 14 | case 8: |
262 | 14 | case 9: |
263 | 14 | case 10: |
264 | 14 | case 11: |
265 | 14 | case 12: |
266 | 14 | case 13: |
267 | 14 | case 14: |
268 | 14 | case 15: |
269 | 14 | case 16: |
270 | 14 | case 17: |
271 | 14 | case 18: |
272 | 14 | case 19: |
273 | 14 | case 40: |
274 | 14 | case 41: |
275 | 14 | case 42: |
276 | 14 | case 43: |
277 | 14 | case 44: |
278 | 14 | case 45: |
279 | 14 | case 46: |
280 | 14 | case 47: |
281 | 14 | case 48: |
282 | 14 | case 49: |
283 | 14 | return static_cast<sf16>(vlx); |
284 | | |
285 | | /* |
286 | | for normal numbers, 'vlx' is the difference between the FP32 value of a number and the |
287 | | FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is |
288 | | baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away |
289 | | from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero. |
290 | | for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even |
291 | | except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */ |
292 | | |
293 | | /* normal number, all rounding modes except round-to-nearest-even: */ |
294 | 0 | case 30: |
295 | 0 | case 31: |
296 | 0 | case 32: |
297 | 0 | case 34: |
298 | 0 | case 35: |
299 | 0 | case 36: |
300 | 0 | case 37: |
301 | 0 | case 39: |
302 | 0 | return static_cast<sf16>((inp + vlx) >> 13); |
303 | | |
304 | | /* normal number, round-to-nearest-even. */ |
305 | 10 | case 33: |
306 | 10 | case 38: |
307 | 10 | p = inp + vlx; |
308 | 10 | p += (inp >> 13) & 1; |
309 | 10 | return static_cast<sf16>(p >> 13); |
310 | | |
311 | | /* |
312 | | the various denormal cases. These are not expected to be common, so their performance is a bit |
313 | | less important. For each of these cases, we need to extract an exponent and a mantissa |
314 | | (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that |
315 | | depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the |
316 | | sign of the resulting denormal number. |
317 | | */ |
318 | 0 | case 21: |
319 | 0 | case 22: |
320 | 0 | case 25: |
321 | 0 | case 27: |
322 | | /* denormal, round towards zero. */ |
323 | 0 | p = 126 - ((inp >> 23) & 0xFF); |
324 | 0 | return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx); |
325 | 0 | case 20: |
326 | 0 | case 26: |
327 | | /* denormal, round away from zero. */ |
328 | 0 | p = 126 - ((inp >> 23) & 0xFF); |
329 | 0 | return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
330 | 0 | case 24: |
331 | 0 | case 29: |
332 | | /* denormal, round to nearest-away */ |
333 | 0 | p = 126 - ((inp >> 23) & 0xFF); |
334 | 0 | return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
335 | 0 | case 23: |
336 | 0 | case 28: |
337 | | /* denormal, round to nearest-even. */ |
338 | 0 | p = 126 - ((inp >> 23) & 0xFF); |
339 | 0 | return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
340 | 24 | } |
341 | | |
342 | 0 | return 0; |
343 | 24 | } |
344 | | |
345 | | /* convert from soft-float to native-float */ |
346 | | float sf16_to_float(uint16_t p) |
347 | 817k | { |
348 | 817k | return astc::uint_as_float(sf16_to_sf32(p)); |
349 | 817k | } |
350 | | |
351 | | /* convert from native-float to soft-float */ |
352 | | uint16_t float_to_sf16(float p) |
353 | 24 | { |
354 | 24 | unsigned int ip = astc::float_as_uint(p); |
355 | 24 | return sf32_to_sf16(ip, SF_NEARESTEVEN); |
356 | 24 | } |
357 | | |
358 | | #endif |