/rust/registry/src/index.crates.io-1949cf8c6b5b557f/half-2.7.1/src/binary16/arch.rs
Line | Count | Source |
1 | | #![allow(dead_code, unused_imports)] |
2 | | use crate::leading_zeros::leading_zeros_u16; |
3 | | use core::mem; |
4 | | |
5 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
6 | | mod x86; |
7 | | |
8 | | #[cfg(target_arch = "aarch64")] |
9 | | mod aarch64; |
10 | | |
11 | | #[cfg(all(feature = "nightly", target_arch = "loongarch64"))] |
12 | | mod loongarch64; |
13 | | |
14 | | macro_rules! convert_fn { |
15 | | (if x86_feature("f16c") { $f16c:expr } |
16 | | else if aarch64_feature("fp16") { $aarch64:expr } |
17 | | else if loongarch64_feature("lsx") { $loongarch64:expr } |
18 | | else { $fallback:expr }) => { |
19 | | cfg_if::cfg_if! { |
20 | | // Use intrinsics directly when the required feature is enabled at compile time or when using no_std |
21 | | if #[cfg(all( |
22 | | any(target_arch = "x86", target_arch = "x86_64"), |
23 | | target_feature = "f16c" |
24 | | ))] { |
25 | | $f16c |
26 | | } |
27 | | else if #[cfg(all( |
28 | | target_arch = "aarch64", |
29 | | target_feature = "fp16" |
30 | | ))] { |
31 | | $aarch64 |
32 | | } |
33 | | else if #[cfg(all( |
34 | | feature = "nightly", |
35 | | target_arch = "loongarch64", |
36 | | target_feature = "lsx" |
37 | | ))] { |
38 | | $loongarch64 |
39 | | } |
40 | | |
41 | | // Use CPU feature detection if using std |
42 | | else if #[cfg(all( |
43 | | feature = "std", |
44 | | any(target_arch = "x86", target_arch = "x86_64") |
45 | | ))] { |
46 | | use std::arch::is_x86_feature_detected; |
47 | | if is_x86_feature_detected!("f16c") { |
48 | | $f16c |
49 | | } else { |
50 | | $fallback |
51 | | } |
52 | | } |
53 | | else if #[cfg(all( |
54 | | feature = "std", |
55 | | target_arch = "aarch64", |
56 | | ))] { |
57 | | use std::arch::is_aarch64_feature_detected; |
58 | | if is_aarch64_feature_detected!("fp16") { |
59 | | $aarch64 |
60 | | } else { |
61 | | $fallback |
62 | | } |
63 | | } |
64 | | else if #[cfg(all( |
65 | | feature = "std", |
66 | | feature = "nightly", |
67 | | target_arch = "loongarch64", |
68 | | ))] { |
69 | | use std::arch::is_loongarch_feature_detected; |
70 | | if is_loongarch_feature_detected!("lsx") { |
71 | | $loongarch64 |
72 | | } else { |
73 | | $fallback |
74 | | } |
75 | | } |
76 | | |
77 | | // Fallback to software |
78 | | else { |
79 | | $fallback |
80 | | } |
81 | | } |
82 | | }; |
83 | | } |
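| |
| | // Illustration only (not part of the crate): the shape the macro produces on a |
| | // std x86/x86_64 build without compile-time `f16c`. `dispatch_sketch` and |
| | // `scalar_fallback` are hypothetical names used just for this sketch. |
| | #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))] |
| | fn dispatch_sketch(f: f32, scalar_fallback: fn(f32) -> u16) -> u16 { |
| |     use std::arch::is_x86_feature_detected; |
| |     if is_x86_feature_detected!("f16c") { |
| |         // SAFETY: the `f16c` feature was just detected at runtime. |
| |         unsafe { x86::f32_to_f16_x86_f16c(f) } |
| |     } else { |
| |         scalar_fallback(f) |
| |     } |
| | } |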
84 | | |
85 | | #[inline] |
86 | 0 | pub(crate) fn f32_to_f16(f: f32) -> u16 { |
87 | 0 | convert_fn! { |
88 | | if x86_feature("f16c") { |
89 | 0 | unsafe { x86::f32_to_f16_x86_f16c(f) } |
90 | | } else if aarch64_feature("fp16") { |
91 | | unsafe { aarch64::f32_to_f16_fp16(f) } |
92 | | } else if loongarch64_feature("lsx") { |
93 | | unsafe { loongarch64::f32_to_f16_lsx(f) } |
94 | | } else { |
95 | 0 | f32_to_f16_fallback(f) |
96 | | } |
97 | | } |
98 | 0 | } |
99 | | |
100 | | #[inline] |
101 | 0 | pub(crate) fn f64_to_f16(f: f64) -> u16 { |
102 | 0 | convert_fn! { |
103 | | if x86_feature("f16c") { |
104 | 0 | unsafe { x86::f32_to_f16_x86_f16c(f as f32) } |
105 | | } else if aarch64_feature("fp16") { |
106 | | unsafe { aarch64::f64_to_f16_fp16(f) } |
107 | | } else if loongarch64_feature("lsx") { |
108 | | f64_to_f16_fallback(f) |
109 | | } else { |
110 | 0 | f64_to_f16_fallback(f) |
111 | | } |
112 | | } |
113 | 0 | } |
114 | | |
115 | | #[inline] |
116 | 0 | pub(crate) fn f16_to_f32(i: u16) -> f32 { |
117 | 0 | convert_fn! { |
118 | | if x86_feature("f16c") { |
119 | 0 | unsafe { x86::f16_to_f32_x86_f16c(i) } |
120 | | } else if aarch64_feature("fp16") { |
121 | | unsafe { aarch64::f16_to_f32_fp16(i) } |
122 | | } else if loongarch64_feature("lsx") { |
123 | | unsafe { loongarch64::f16_to_f32_lsx(i) } |
124 | | } else { |
125 | 0 | f16_to_f32_fallback(i) |
126 | | } |
127 | | } |
128 | 0 | } |
129 | | |
130 | | #[inline] |
131 | 0 | pub(crate) fn f16_to_f64(i: u16) -> f64 { |
132 | 0 | convert_fn! { |
133 | | if x86_feature("f16c") { |
134 | 0 | unsafe { x86::f16_to_f32_x86_f16c(i) as f64 } |
135 | | } else if aarch64_feature("fp16") { |
136 | | unsafe { aarch64::f16_to_f64_fp16(i) } |
137 | | } else if loongarch64_feature("lsx") { |
138 | | unsafe { loongarch64::f16_to_f32_lsx(i) as f64 } |
139 | | } else { |
140 | 0 | f16_to_f64_fallback(i) |
141 | | } |
142 | | } |
143 | 0 | } |
144 | | |
145 | | #[inline] |
146 | 0 | pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] { |
147 | 0 | convert_fn! { |
148 | | if x86_feature("f16c") { |
149 | 0 | unsafe { x86::f32x4_to_f16x4_x86_f16c(f) } |
150 | | } else if aarch64_feature("fp16") { |
151 | | unsafe { aarch64::f32x4_to_f16x4_fp16(f) } |
152 | | } else if loongarch64_feature("lsx") { |
153 | | unsafe { loongarch64::f32x4_to_f16x4_lsx(f) } |
154 | | } else { |
155 | 0 | f32x4_to_f16x4_fallback(f) |
156 | | } |
157 | | } |
158 | 0 | } |
159 | | |
160 | | #[inline] |
161 | 0 | pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] { |
162 | 0 | convert_fn! { |
163 | | if x86_feature("f16c") { |
164 | 0 | unsafe { x86::f16x4_to_f32x4_x86_f16c(i) } |
165 | | } else if aarch64_feature("fp16") { |
166 | | unsafe { aarch64::f16x4_to_f32x4_fp16(i) } |
167 | | } else if loongarch64_feature("lsx") { |
168 | | unsafe { loongarch64::f16x4_to_f32x4_lsx(i) } |
169 | | } else { |
170 | 0 | f16x4_to_f32x4_fallback(i) |
171 | | } |
172 | | } |
173 | 0 | } |
174 | | |
175 | | #[inline] |
176 | 0 | pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] { |
177 | 0 | convert_fn! { |
178 | | if x86_feature("f16c") { |
179 | 0 | unsafe { x86::f64x4_to_f16x4_x86_f16c(f) } |
180 | | } else if aarch64_feature("fp16") { |
181 | | unsafe { aarch64::f64x4_to_f16x4_fp16(f) } |
182 | | } else if loongarch64_feature("lsx") { |
183 | | unsafe { loongarch64::f64x4_to_f16x4_lsx(f) } |
184 | | } else { |
185 | 0 | f64x4_to_f16x4_fallback(f) |
186 | | } |
187 | | } |
188 | 0 | } |
189 | | |
190 | | #[inline] |
191 | 0 | pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] { |
192 | 0 | convert_fn! { |
193 | | if x86_feature("f16c") { |
194 | 0 | unsafe { x86::f16x4_to_f64x4_x86_f16c(i) } |
195 | | } else if aarch64_feature("fp16") { |
196 | | unsafe { aarch64::f16x4_to_f64x4_fp16(i) } |
197 | | } else if loongarch64_feature("lsx") { |
198 | | unsafe { loongarch64::f16x4_to_f64x4_lsx(i) } |
199 | | } else { |
200 | 0 | f16x4_to_f64x4_fallback(i) |
201 | | } |
202 | | } |
203 | 0 | } |
204 | | |
205 | | #[inline] |
206 | 0 | pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] { |
207 | 0 | convert_fn! { |
208 | | if x86_feature("f16c") { |
209 | 0 | unsafe { x86::f32x8_to_f16x8_x86_f16c(f) } |
210 | | } else if aarch64_feature("fp16") { |
211 | | { |
212 | | let mut result = [0u16; 8]; |
213 | | convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(), |
214 | | aarch64::f32x4_to_f16x4_fp16); |
215 | | result |
216 | | } |
217 | | } else if loongarch64_feature("lsx") { |
218 | | { |
219 | | let mut result = [0u16; 8]; |
220 | | convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(), |
221 | | loongarch64::f32x4_to_f16x4_lsx); |
222 | | result |
223 | | } |
224 | | } else { |
225 | 0 | f32x8_to_f16x8_fallback(f) |
226 | | } |
227 | | } |
228 | 0 | } |
229 | | |
230 | | #[inline] |
231 | 0 | pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] { |
232 | 0 | convert_fn! { |
233 | | if x86_feature("f16c") { |
234 | 0 | unsafe { x86::f16x8_to_f32x8_x86_f16c(i) } |
235 | | } else if aarch64_feature("fp16") { |
236 | | { |
237 | | let mut result = [0f32; 8]; |
238 | | convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(), |
239 | | aarch64::f16x4_to_f32x4_fp16); |
240 | | result |
241 | | } |
242 | | } else if loongarch64_feature("lsx") { |
243 | | { |
244 | | let mut result = [0f32; 8]; |
245 | | convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(), |
246 | | loongarch64::f16x4_to_f32x4_lsx); |
247 | | result |
248 | | } |
249 | | } else { |
250 | 0 | f16x8_to_f32x8_fallback(i) |
251 | | } |
252 | | } |
253 | 0 | } |
254 | | |
255 | | #[inline] |
256 | 0 | pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] { |
257 | 0 | convert_fn! { |
258 | | if x86_feature("f16c") { |
259 | 0 | unsafe { x86::f64x8_to_f16x8_x86_f16c(f) } |
260 | | } else if aarch64_feature("fp16") { |
261 | | { |
262 | | let mut result = [0u16; 8]; |
263 | | convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(), |
264 | | aarch64::f64x4_to_f16x4_fp16); |
265 | | result |
266 | | } |
267 | | } else if loongarch64_feature("lsx") { |
268 | | { |
269 | | let mut result = [0u16; 8]; |
270 | | convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(), |
271 | | loongarch64::f64x4_to_f16x4_lsx); |
272 | | result |
273 | | } |
274 | | } else { |
275 | 0 | f64x8_to_f16x8_fallback(f) |
276 | | } |
277 | | } |
278 | 0 | } |
279 | | |
280 | | #[inline] |
281 | 0 | pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] { |
282 | 0 | convert_fn! { |
283 | | if x86_feature("f16c") { |
284 | 0 | unsafe { x86::f16x8_to_f64x8_x86_f16c(i) } |
285 | | } else if aarch64_feature("fp16") { |
286 | | { |
287 | | let mut result = [0f64; 8]; |
288 | | convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(), |
289 | | aarch64::f16x4_to_f64x4_fp16); |
290 | | result |
291 | | } |
292 | | } else if loongarch64_feature("lsx") { |
293 | | { |
294 | | let mut result = [0f64; 8]; |
295 | | convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(), |
296 | | loongarch64::f16x4_to_f64x4_lsx); |
297 | | result |
298 | | } |
299 | | } else { |
300 | 0 | f16x8_to_f64x8_fallback(i) |
301 | | } |
302 | | } |
303 | 0 | } |
304 | | |
305 | | #[inline] |
306 | 0 | pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) { |
307 | 0 | convert_fn! { |
308 | | if x86_feature("f16c") { |
309 | 0 | convert_chunked_slice_8(src, dst, x86::f32x8_to_f16x8_x86_f16c, |
310 | 0 | x86::f32x4_to_f16x4_x86_f16c) |
311 | | } else if aarch64_feature("fp16") { |
312 | | convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16) |
313 | | } else if loongarch64_feature("lsx") { |
314 | | convert_chunked_slice_4(src, dst, loongarch64::f32x4_to_f16x4_lsx) |
315 | | } else { |
316 | 0 | slice_fallback(src, dst, f32_to_f16_fallback) |
317 | | } |
318 | | } |
319 | 0 | } |
320 | | |
321 | | #[inline] |
322 | 211k | pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) { |
323 | 211k | convert_fn! { |
324 | | if x86_feature("f16c") { |
325 | 211k | convert_chunked_slice_8(src, dst, x86::f16x8_to_f32x8_x86_f16c, |
326 | 211k | x86::f16x4_to_f32x4_x86_f16c) |
327 | | } else if aarch64_feature("fp16") { |
328 | | convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16) |
329 | | } else if loongarch64_feature("lsx") { |
330 | | convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f32x4_lsx) |
331 | | } else { |
332 | 0 | slice_fallback(src, dst, f16_to_f32_fallback) |
333 | | } |
334 | | } |
335 | 211k | } |
336 | | |
337 | | #[inline] |
338 | 0 | pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) { |
339 | 0 | convert_fn! { |
340 | | if x86_feature("f16c") { |
341 | 0 | convert_chunked_slice_8(src, dst, x86::f64x8_to_f16x8_x86_f16c, |
342 | 0 | x86::f64x4_to_f16x4_x86_f16c) |
343 | | } else if aarch64_feature("fp16") { |
344 | | convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16) |
345 | | } else if loongarch64_feature("lsx") { |
346 | | convert_chunked_slice_4(src, dst, loongarch64::f64x4_to_f16x4_lsx) |
347 | | } else { |
348 | 0 | slice_fallback(src, dst, f64_to_f16_fallback) |
349 | | } |
350 | | } |
351 | 0 | } |
352 | | |
353 | | #[inline] |
354 | 0 | pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) { |
355 | 0 | convert_fn! { |
356 | | if x86_feature("f16c") { |
357 | 0 | convert_chunked_slice_8(src, dst, x86::f16x8_to_f64x8_x86_f16c, |
358 | 0 | x86::f16x4_to_f64x4_x86_f16c) |
359 | | } else if aarch64_feature("fp16") { |
360 | | convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16) |
361 | | } else if loongarch64_feature("lsx") { |
362 | | convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f64x4_lsx) |
363 | | } else { |
364 | 0 | slice_fallback(src, dst, f16_to_f64_fallback) |
365 | | } |
366 | | } |
367 | 0 | } |
368 | | |
369 | | macro_rules! math_fn { |
370 | | (if aarch64_feature("fp16") { $aarch64:expr } |
371 | | else { $fallback:expr }) => { |
372 | | cfg_if::cfg_if! { |
373 | | // Use intrinsics directly when the required feature is enabled at compile time or when using no_std |
374 | | if #[cfg(all( |
375 | | target_arch = "aarch64", |
376 | | target_feature = "fp16" |
377 | | ))] { |
378 | | $aarch64 |
379 | | } |
380 | | |
381 | | // Use CPU feature detection if using std |
382 | | else if #[cfg(all( |
383 | | feature = "std", |
384 | | target_arch = "aarch64", |
385 | | not(target_feature = "fp16") |
386 | | ))] { |
387 | | use std::arch::is_aarch64_feature_detected; |
388 | | if is_aarch64_feature_detected!("fp16") { |
389 | | $aarch64 |
390 | | } else { |
391 | | $fallback |
392 | | } |
393 | | } |
394 | | |
395 | | // Fallback to software |
396 | | else { |
397 | | $fallback |
398 | | } |
399 | | } |
400 | | }; |
401 | | } |
402 | | |
403 | | #[inline] |
404 | 0 | pub(crate) fn add_f16(a: u16, b: u16) -> u16 { |
405 | | math_fn! { |
406 | | if aarch64_feature("fp16") { |
407 | | unsafe { aarch64::add_f16_fp16(a, b) } |
408 | | } else { |
409 | 0 | add_f16_fallback(a, b) |
410 | | } |
411 | | } |
412 | 0 | } |
413 | | |
414 | | #[inline] |
415 | 0 | pub(crate) fn subtract_f16(a: u16, b: u16) -> u16 { |
416 | | math_fn! { |
417 | | if aarch64_feature("fp16") { |
418 | | unsafe { aarch64::subtract_f16_fp16(a, b) } |
419 | | } else { |
420 | 0 | subtract_f16_fallback(a, b) |
421 | | } |
422 | | } |
423 | 0 | } |
424 | | |
425 | | #[inline] |
426 | 0 | pub(crate) fn multiply_f16(a: u16, b: u16) -> u16 { |
427 | | math_fn! { |
428 | | if aarch64_feature("fp16") { |
429 | | unsafe { aarch64::multiply_f16_fp16(a, b) } |
430 | | } else { |
431 | 0 | multiply_f16_fallback(a, b) |
432 | | } |
433 | | } |
434 | 0 | } |
435 | | |
436 | | #[inline] |
437 | 0 | pub(crate) fn divide_f16(a: u16, b: u16) -> u16 { |
438 | | math_fn! { |
439 | | if aarch64_feature("fp16") { |
440 | | unsafe { aarch64::divide_f16_fp16(a, b) } |
441 | | } else { |
442 | 0 | divide_f16_fallback(a, b) |
443 | | } |
444 | | } |
445 | 0 | } |
446 | | |
447 | | #[inline] |
448 | 0 | pub(crate) fn remainder_f16(a: u16, b: u16) -> u16 { |
449 | 0 | remainder_f16_fallback(a, b) |
450 | 0 | } |
451 | | |
452 | | #[inline] |
453 | 0 | pub(crate) fn product_f16<I: Iterator<Item = u16>>(iter: I) -> u16 { |
454 | | math_fn! { |
455 | | if aarch64_feature("fp16") { |
456 | | iter.fold(0x3C00, |acc, x| unsafe { aarch64::multiply_f16_fp16(acc, x) }) // start from 1.0 (0x3C00), the multiplicative identity |
457 | | } else { |
458 | 0 | product_f16_fallback(iter) |
459 | | } |
460 | | } |
461 | 0 | } |
462 | | |
463 | | #[inline] |
464 | 0 | pub(crate) fn sum_f16<I: Iterator<Item = u16>>(iter: I) -> u16 { |
465 | | math_fn! { |
466 | | if aarch64_feature("fp16") { |
467 | | iter.fold(0, |acc, x| unsafe { aarch64::add_f16_fp16(acc, x) }) |
468 | | } else { |
469 | 0 | sum_f16_fallback(iter) |
470 | | } |
471 | | } |
472 | 0 | } |
473 | | |
474 | | /// Converts a slice in chunks of 8, handling any remainder through a padded x8 or x4 chunk |
475 | | #[inline] |
476 | 211k | fn convert_chunked_slice_8<S: Copy + Default, D: Copy>( |
477 | 211k | src: &[S], |
478 | 211k | dst: &mut [D], |
479 | 211k | fn8: unsafe fn(&[S; 8]) -> [D; 8], |
480 | 211k | fn4: unsafe fn(&[S; 4]) -> [D; 4], |
481 | 211k | ) { |
482 | 211k | assert_eq!(src.len(), dst.len()); |
483 | | |
484 | | // TODO: Can be further optimized with array_chunks when it becomes stabilized |
485 | | |
486 | 211k | let src_chunks = src.chunks_exact(8); |
487 | 211k | let mut dst_chunks = dst.chunks_exact_mut(8); |
488 | 211k | let src_remainder = src_chunks.remainder(); |
489 | 340k | for (s, d) in src_chunks.zip(&mut dst_chunks) { |
490 | 340k | let chunk: &[S; 8] = s.try_into().unwrap(); |
491 | 340k | d.copy_from_slice(unsafe { &fn8(chunk) }); |
492 | 340k | } |
493 | | |
494 | | // Process remainder |
495 | 211k | if src_remainder.len() > 4 { |
496 | 21 | let mut buf: [S; 8] = Default::default(); |
497 | 21 | buf[..src_remainder.len()].copy_from_slice(src_remainder); |
498 | 21 | let vec = unsafe { fn8(&buf) }; |
499 | 21 | let dst_remainder = dst_chunks.into_remainder(); |
500 | 21 | dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]); |
501 | 211k | } else if !src_remainder.is_empty() { |
502 | 40.7k | let mut buf: [S; 4] = Default::default(); |
503 | 40.7k | buf[..src_remainder.len()].copy_from_slice(src_remainder); |
504 | 40.7k | let vec = unsafe { fn4(&buf) }; |
505 | 40.7k | let dst_remainder = dst_chunks.into_remainder(); |
506 | 40.7k | dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]); |
507 | 170k | } |
508 | 211k | } |
509 | | |
510 | | /// Converts a slice in chunks of 4, handling any remainder through a padded x4 chunk |
511 | | #[inline] |
512 | 0 | fn convert_chunked_slice_4<S: Copy + Default, D: Copy>( |
513 | 0 | src: &[S], |
514 | 0 | dst: &mut [D], |
515 | 0 | f: unsafe fn(&[S; 4]) -> [D; 4], |
516 | 0 | ) { |
517 | 0 | assert_eq!(src.len(), dst.len()); |
518 | | |
519 | | // TODO: Can be further optimized with array_chunks when it becomes stabilized |
520 | | |
521 | 0 | let src_chunks = src.chunks_exact(4); |
522 | 0 | let mut dst_chunks = dst.chunks_exact_mut(4); |
523 | 0 | let src_remainder = src_chunks.remainder(); |
524 | 0 | for (s, d) in src_chunks.zip(&mut dst_chunks) { |
525 | 0 | let chunk: &[S; 4] = s.try_into().unwrap(); |
526 | 0 | d.copy_from_slice(unsafe { &f(chunk) }); |
527 | 0 | } |
528 | | |
529 | | // Process remainder |
530 | 0 | if !src_remainder.is_empty() { |
531 | 0 | let mut buf: [S; 4] = Default::default(); |
532 | 0 | buf[..src_remainder.len()].copy_from_slice(src_remainder); |
533 | 0 | let vec = unsafe { f(&buf) }; |
534 | 0 | let dst_remainder = dst_chunks.into_remainder(); |
535 | 0 | dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]); |
536 | 0 | } |
537 | 0 | } |
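| |
| | // Illustration only (not part of the crate): how the chunked helpers finish a |
| | // slice whose length is not a multiple of the chunk width. The remainder is |
| | // copied into a zero-initialized (`Default`) buffer, converted as a full chunk, |
| | // and only the valid prefix is copied back. `widen4` is a hypothetical scalar |
| | // stand-in for the SIMD kernels. |
| | #[cfg(test)] |
| | mod chunked_slice_sketch { |
| |     use super::convert_chunked_slice_4; |
| |
| |     #[test] |
| |     fn remainder_goes_through_padded_buffer() { |
| |         unsafe fn widen4(v: &[u16; 4]) -> [u32; 4] { |
| |             [v[0] as u32, v[1] as u32, v[2] as u32, v[3] as u32] |
| |         } |
| |         // 6 elements: one full x4 chunk plus a 2-element remainder. |
| |         let src = [1u16, 2, 3, 4, 5, 6]; |
| |         let mut dst = [0u32; 6]; |
| |         convert_chunked_slice_4(&src, &mut dst, widen4); |
| |         assert_eq!(dst, [1, 2, 3, 4, 5, 6]); |
| |     } |
| | } |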
538 | | |
539 | | /////////////// Fallbacks //////////////// |
540 | | |
541 | | // In the below functions, round to nearest, with ties to even. |
542 | | // Let us call the most significant bit that will be shifted out the round_bit. |
543 | | // |
544 | | // Round up if either |
545 | | // a) Removed part > tie. |
546 | | // (mantissa & round_bit) != 0 && (mantissa & (round_bit - 1)) != 0 |
547 | | // b) Removed part == tie, and retained part is odd. |
548 | | // (mantissa & round_bit) != 0 && (mantissa & (2 * round_bit)) != 0 |
549 | | // (If removed part == tie and retained part is even, do not round up.) |
550 | | // These two conditions can be combined into one: |
551 | | // (mantissa & round_bit) != 0 && (mantissa & ((round_bit - 1) | (2 * round_bit))) != 0 |
552 | | // which can be simplified into |
553 | | // (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0 |
554 | | |
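| | // A minimal, self-contained sketch of the rule above (illustration only, not |
| | // part of the crate): rounding a 23-bit f32 mantissa down to 10 bits, so |
| | // round_bit = 1 << 12. |
| | #[cfg(test)] |
| | mod round_to_nearest_even_sketch { |
| |     #[test] |
| |     fn ties_round_to_even() { |
| |         fn round_to_10_bits(man: u32) -> u32 { |
| |             let round_bit = 0x0000_1000u32; |
| |             let mut half_man = man >> 13; |
| |             if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 { |
| |                 half_man += 1; |
| |             } |
| |             half_man |
| |         } |
| |         assert_eq!(round_to_10_bits(0x0000_1000), 0); // exact tie, retained part even: truncate |
| |         assert_eq!(round_to_10_bits(0x0000_3000), 2); // exact tie, retained part odd: round up |
| |         assert_eq!(round_to_10_bits(0x0000_1001), 1); // removed part above the tie: round up |
| |     } |
| | } |
| |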
555 | | #[inline] |
556 | 0 | pub(crate) const fn f32_to_f16_fallback(value: f32) -> u16 { |
557 | | // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized |
558 | | // Convert to raw bytes |
559 | 0 | let x: u32 = unsafe { mem::transmute::<f32, u32>(value) }; |
560 | | |
561 | | // Extract IEEE754 components |
562 | 0 | let sign = x & 0x8000_0000u32; |
563 | 0 | let exp = x & 0x7F80_0000u32; |
564 | 0 | let man = x & 0x007F_FFFFu32; |
565 | | |
566 | | // Check for all exponent bits being set, which is Infinity or NaN |
567 | 0 | if exp == 0x7F80_0000u32 { |
568 | | // Set mantissa MSB for NaN (and also keep shifted mantissa bits) |
569 | 0 | let nan_bit = if man == 0 { 0 } else { 0x0200u32 }; |
570 | 0 | return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 13)) as u16; |
571 | 0 | } |
572 | | |
573 | | // The number is normalized, start assembling half precision version |
574 | 0 | let half_sign = sign >> 16; |
575 | | // Unbias the exponent, then bias for half precision |
576 | 0 | let unbiased_exp = ((exp >> 23) as i32) - 127; |
577 | 0 | let half_exp = unbiased_exp + 15; |
578 | | |
579 | | // Check for exponent overflow, return +infinity |
580 | 0 | if half_exp >= 0x1F { |
581 | 0 | return (half_sign | 0x7C00u32) as u16; |
582 | 0 | } |
583 | | |
584 | | // Check for underflow |
585 | 0 | if half_exp <= 0 { |
586 | | // Check mantissa for what we can do |
587 | 0 | if 14 - half_exp > 24 { |
588 | | // No rounding possibility, so this is a full underflow, return signed zero |
589 | 0 | return half_sign as u16; |
590 | 0 | } |
591 | | // Don't forget about hidden leading mantissa bit when assembling mantissa |
592 | 0 | let man = man | 0x0080_0000u32; |
593 | 0 | let mut half_man = man >> (14 - half_exp); |
594 | | // Check for rounding (see comment above functions) |
595 | 0 | let round_bit = 1 << (13 - half_exp); |
596 | 0 | if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 { |
597 | 0 | half_man += 1; |
598 | 0 | } |
599 | | // No exponent for subnormals |
600 | 0 | return (half_sign | half_man) as u16; |
601 | 0 | } |
602 | | |
603 | | // Rebias the exponent |
604 | 0 | let half_exp = (half_exp as u32) << 10; |
605 | 0 | let half_man = man >> 13; |
606 | | // Check for rounding (see comment above functions) |
607 | 0 | let round_bit = 0x0000_1000u32; |
608 | 0 | if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 { |
609 | | // Round it |
610 | 0 | ((half_sign | half_exp | half_man) + 1) as u16 |
611 | | } else { |
612 | 0 | (half_sign | half_exp | half_man) as u16 |
613 | | } |
614 | 0 | } |
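| |
| | // Worked example (illustration only, not part of the crate): 1.5f32 is |
| | // 0x3FC0_0000, so the sign is 0, the exponent rebiases from 127 to 15, and the |
| | // top 10 mantissa bits are 0x200; no rounding occurs and the result is 0x3E00, |
| | // which is 1.5 in binary16. |
| | #[cfg(test)] |
| | mod f32_to_f16_fallback_sketch { |
| |     #[test] |
| |     fn converts_one_point_five() { |
| |         assert_eq!(super::f32_to_f16_fallback(1.5f32), 0x3E00); |
| |     } |
| | } |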
615 | | |
616 | | #[inline] |
617 | 0 | pub(crate) const fn f64_to_f16_fallback(value: f64) -> u16 { |
618 | | // Convert to raw bytes, truncating the last 32 bits of the mantissa; that precision will |
619 | | // always be lost at half precision. |
620 | | // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized |
621 | 0 | let val: u64 = unsafe { mem::transmute::<f64, u64>(value) }; |
622 | 0 | let x = (val >> 32) as u32; |
623 | | |
624 | | // Extract IEEE754 components |
625 | 0 | let sign = x & 0x8000_0000u32; |
626 | 0 | let exp = x & 0x7FF0_0000u32; |
627 | 0 | let man = x & 0x000F_FFFFu32; |
628 | | |
629 | | // Check for all exponent bits being set, which is Infinity or NaN |
630 | 0 | if exp == 0x7FF0_0000u32 { |
631 | | // Set mantissa MSB for NaN (and also keep shifted mantissa bits). |
632 | | // We also have to check the last 32 bits. |
633 | 0 | let nan_bit = if man == 0 && (val as u32 == 0) { |
634 | 0 | 0 |
635 | | } else { |
636 | 0 | 0x0200u32 |
637 | | }; |
638 | 0 | return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 10)) as u16; |
639 | 0 | } |
640 | | |
641 | | // The number is normalized, start assembling half precision version |
642 | 0 | let half_sign = sign >> 16; |
643 | | // Unbias the exponent, then bias for half precision |
644 | 0 | let unbiased_exp = ((exp >> 20) as i64) - 1023; |
645 | 0 | let half_exp = unbiased_exp + 15; |
646 | | |
647 | | // Check for exponent overflow, return +infinity |
648 | 0 | if half_exp >= 0x1F { |
649 | 0 | return (half_sign | 0x7C00u32) as u16; |
650 | 0 | } |
651 | | |
652 | | // Check for underflow |
653 | 0 | if half_exp <= 0 { |
654 | | // Check mantissa for what we can do |
655 | 0 | if 10 - half_exp > 21 { |
656 | | // No rounding possibility, so this is a full underflow, return signed zero |
657 | 0 | return half_sign as u16; |
658 | 0 | } |
659 | | // Don't forget about hidden leading mantissa bit when assembling mantissa |
660 | 0 | let man = man | 0x0010_0000u32; |
661 | 0 | let mut half_man = man >> (11 - half_exp); |
662 | | // Check for rounding (see comment above functions) |
663 | 0 | let round_bit = 1 << (10 - half_exp); |
664 | 0 | if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 { |
665 | 0 | half_man += 1; |
666 | 0 | } |
667 | | // No exponent for subnormals |
668 | 0 | return (half_sign | half_man) as u16; |
669 | 0 | } |
670 | | |
671 | | // Rebias the exponent |
672 | 0 | let half_exp = (half_exp as u32) << 10; |
673 | 0 | let half_man = man >> 10; |
674 | | // Check for rounding (see comment above functions) |
675 | 0 | let round_bit = 0x0000_0200u32; |
676 | 0 | if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 { |
677 | | // Round it |
678 | 0 | ((half_sign | half_exp | half_man) + 1) as u16 |
679 | | } else { |
680 | 0 | (half_sign | half_exp | half_man) as u16 |
681 | | } |
682 | 0 | } |
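| |
| | // Illustration only (not part of the crate): the truncation above drops f64 |
| | // mantissa bits far below half precision, so a 2^-30 perturbation of 1.5 still |
| | // converts to the same binary16 value. |
| | #[cfg(test)] |
| | mod f64_to_f16_fallback_sketch { |
| |     #[test] |
| |     fn low_mantissa_bits_are_truncated() { |
| |         // 2^-30 == f64::from_bits(0x3E10_0000_0000_0000) |
| |         assert_eq!(super::f64_to_f16_fallback(1.5f64), 0x3E00); |
| |         assert_eq!( |
| |             super::f64_to_f16_fallback(1.5f64 + f64::from_bits(0x3E10_0000_0000_0000)), |
| |             0x3E00 |
| |         ); |
| |     } |
| | } |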
683 | | |
684 | | #[inline] |
685 | 0 | pub(crate) const fn f16_to_f32_fallback(i: u16) -> f32 { |
686 | | // Check for signed zero |
687 | | // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized |
688 | 0 | if i & 0x7FFFu16 == 0 { |
689 | 0 | return unsafe { mem::transmute::<u32, f32>((i as u32) << 16) }; |
690 | 0 | } |
691 | | |
692 | 0 | let half_sign = (i & 0x8000u16) as u32; |
693 | 0 | let half_exp = (i & 0x7C00u16) as u32; |
694 | 0 | let half_man = (i & 0x03FFu16) as u32; |
695 | | |
696 | | // Check for an infinity or NaN when all exponent bits set |
697 | 0 | if half_exp == 0x7C00u32 { |
698 | | // Check for signed infinity if mantissa is zero |
699 | 0 | if half_man == 0 { |
700 | 0 | return unsafe { mem::transmute::<u32, f32>((half_sign << 16) | 0x7F80_0000u32) }; |
701 | | } else { |
702 | | // NaN, keep current mantissa but also set most significant mantissa bit |
703 | | return unsafe { |
704 | 0 | mem::transmute::<u32, f32>((half_sign << 16) | 0x7FC0_0000u32 | (half_man << 13)) |
705 | | }; |
706 | | } |
707 | 0 | } |
708 | | |
709 | | // Calculate single-precision components with adjusted exponent |
710 | 0 | let sign = half_sign << 16; |
711 | | // Unbias exponent |
712 | 0 | let unbiased_exp = ((half_exp as i32) >> 10) - 15; |
713 | | |
714 | | // Check for subnormals, which will be normalized by adjusting exponent |
715 | 0 | if half_exp == 0 { |
716 | | // Calculate how much to adjust the exponent by |
717 | 0 | let e = leading_zeros_u16(half_man as u16) - 6; |
718 | | |
719 | | // Rebias and adjust exponent |
720 | 0 | let exp = (127 - 15 - e) << 23; |
721 | 0 | let man = (half_man << (14 + e)) & 0x7F_FF_FFu32; |
722 | 0 | return unsafe { mem::transmute::<u32, f32>(sign | exp | man) }; |
723 | 0 | } |
724 | | |
725 | | // Rebias exponent for a normalized normal |
726 | 0 | let exp = ((unbiased_exp + 127) as u32) << 23; |
727 | 0 | let man = (half_man & 0x03FFu32) << 13; |
728 | 0 | unsafe { mem::transmute::<u32, f32>(sign | exp | man) } |
729 | 0 | } |
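| |
| | // Illustration only (not part of the crate): the subnormal branch above |
| | // renormalizes via `leading_zeros_u16`; the smallest positive binary16 |
| | // subnormal 0x0001 (2^-24) converts exactly, as does 1.0 (0x3C00). |
| | #[cfg(test)] |
| | mod f16_to_f32_fallback_sketch { |
| |     #[test] |
| |     fn smallest_subnormal_and_one_are_exact() { |
| |         // 2^-24 == f32::from_bits(0x3380_0000) |
| |         assert_eq!(super::f16_to_f32_fallback(0x0001), f32::from_bits(0x3380_0000)); |
| |         assert_eq!(super::f16_to_f32_fallback(0x3C00), 1.0f32); |
| |     } |
| | } |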
730 | | |
731 | | #[inline] |
732 | 0 | pub(crate) const fn f16_to_f64_fallback(i: u16) -> f64 { |
733 | | // Check for signed zero |
734 | | // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized |
735 | 0 | if i & 0x7FFFu16 == 0 { |
736 | 0 | return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) }; |
737 | 0 | } |
738 | | |
739 | 0 | let half_sign = (i & 0x8000u16) as u64; |
740 | 0 | let half_exp = (i & 0x7C00u16) as u64; |
741 | 0 | let half_man = (i & 0x03FFu16) as u64; |
742 | | |
743 | | // Check for an infinity or NaN when all exponent bits set |
744 | 0 | if half_exp == 0x7C00u64 { |
745 | | // Check for signed infinity if mantissa is zero |
746 | 0 | if half_man == 0 { |
747 | | return unsafe { |
748 | 0 | mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64) |
749 | | }; |
750 | | } else { |
751 | | // NaN, keep current mantissa but also set most significant mantissa bit |
752 | | return unsafe { |
753 | 0 | mem::transmute::<u64, f64>( |
754 | 0 | (half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 42), |
755 | | ) |
756 | | }; |
757 | | } |
758 | 0 | } |
759 | | |
760 | | // Calculate double-precision components with adjusted exponent |
761 | 0 | let sign = half_sign << 48; |
762 | | // Unbias exponent |
763 | 0 | let unbiased_exp = ((half_exp as i64) >> 10) - 15; |
764 | | |
765 | | // Check for subnormals, which will be normalized by adjusting exponent |
766 | 0 | if half_exp == 0 { |
767 | | // Calculate how much to adjust the exponent by |
768 | 0 | let e = leading_zeros_u16(half_man as u16) - 6; |
769 | | |
770 | | // Rebias and adjust exponent |
771 | 0 | let exp = ((1023 - 15 - e) as u64) << 52; |
772 | 0 | let man = (half_man << (43 + e)) & 0xF_FFFF_FFFF_FFFFu64; |
773 | 0 | return unsafe { mem::transmute::<u64, f64>(sign | exp | man) }; |
774 | 0 | } |
775 | | |
776 | | // Rebias exponent for a normalized normal |
777 | 0 | let exp = ((unbiased_exp + 1023) as u64) << 52; |
778 | 0 | let man = (half_man & 0x03FFu64) << 42; |
779 | 0 | unsafe { mem::transmute::<u64, f64>(sign | exp | man) } |
780 | 0 | } |
781 | | |
782 | | #[inline] |
783 | 0 | fn f16x4_to_f32x4_fallback(v: &[u16; 4]) -> [f32; 4] { |
784 | 0 | [ |
785 | 0 | f16_to_f32_fallback(v[0]), |
786 | 0 | f16_to_f32_fallback(v[1]), |
787 | 0 | f16_to_f32_fallback(v[2]), |
788 | 0 | f16_to_f32_fallback(v[3]), |
789 | 0 | ] |
790 | 0 | } |
791 | | |
792 | | #[inline] |
793 | 0 | fn f32x4_to_f16x4_fallback(v: &[f32; 4]) -> [u16; 4] { |
794 | 0 | [ |
795 | 0 | f32_to_f16_fallback(v[0]), |
796 | 0 | f32_to_f16_fallback(v[1]), |
797 | 0 | f32_to_f16_fallback(v[2]), |
798 | 0 | f32_to_f16_fallback(v[3]), |
799 | 0 | ] |
800 | 0 | } |
801 | | |
802 | | #[inline] |
803 | 0 | fn f16x4_to_f64x4_fallback(v: &[u16; 4]) -> [f64; 4] { |
804 | 0 | [ |
805 | 0 | f16_to_f64_fallback(v[0]), |
806 | 0 | f16_to_f64_fallback(v[1]), |
807 | 0 | f16_to_f64_fallback(v[2]), |
808 | 0 | f16_to_f64_fallback(v[3]), |
809 | 0 | ] |
810 | 0 | } |
811 | | |
812 | | #[inline] |
813 | 0 | fn f64x4_to_f16x4_fallback(v: &[f64; 4]) -> [u16; 4] { |
814 | 0 | [ |
815 | 0 | f64_to_f16_fallback(v[0]), |
816 | 0 | f64_to_f16_fallback(v[1]), |
817 | 0 | f64_to_f16_fallback(v[2]), |
818 | 0 | f64_to_f16_fallback(v[3]), |
819 | 0 | ] |
820 | 0 | } |
821 | | |
822 | | #[inline] |
823 | 0 | fn f16x8_to_f32x8_fallback(v: &[u16; 8]) -> [f32; 8] { |
824 | 0 | [ |
825 | 0 | f16_to_f32_fallback(v[0]), |
826 | 0 | f16_to_f32_fallback(v[1]), |
827 | 0 | f16_to_f32_fallback(v[2]), |
828 | 0 | f16_to_f32_fallback(v[3]), |
829 | 0 | f16_to_f32_fallback(v[4]), |
830 | 0 | f16_to_f32_fallback(v[5]), |
831 | 0 | f16_to_f32_fallback(v[6]), |
832 | 0 | f16_to_f32_fallback(v[7]), |
833 | 0 | ] |
834 | 0 | } |
835 | | |
836 | | #[inline] |
837 | 0 | fn f32x8_to_f16x8_fallback(v: &[f32; 8]) -> [u16; 8] { |
838 | 0 | [ |
839 | 0 | f32_to_f16_fallback(v[0]), |
840 | 0 | f32_to_f16_fallback(v[1]), |
841 | 0 | f32_to_f16_fallback(v[2]), |
842 | 0 | f32_to_f16_fallback(v[3]), |
843 | 0 | f32_to_f16_fallback(v[4]), |
844 | 0 | f32_to_f16_fallback(v[5]), |
845 | 0 | f32_to_f16_fallback(v[6]), |
846 | 0 | f32_to_f16_fallback(v[7]), |
847 | 0 | ] |
848 | 0 | } |
849 | | |
850 | | #[inline] |
851 | 0 | fn f16x8_to_f64x8_fallback(v: &[u16; 8]) -> [f64; 8] { |
852 | 0 | [ |
853 | 0 | f16_to_f64_fallback(v[0]), |
854 | 0 | f16_to_f64_fallback(v[1]), |
855 | 0 | f16_to_f64_fallback(v[2]), |
856 | 0 | f16_to_f64_fallback(v[3]), |
857 | 0 | f16_to_f64_fallback(v[4]), |
858 | 0 | f16_to_f64_fallback(v[5]), |
859 | 0 | f16_to_f64_fallback(v[6]), |
860 | 0 | f16_to_f64_fallback(v[7]), |
861 | 0 | ] |
862 | 0 | } |
863 | | |
864 | | #[inline] |
865 | 0 | fn f64x8_to_f16x8_fallback(v: &[f64; 8]) -> [u16; 8] { |
866 | 0 | [ |
867 | 0 | f64_to_f16_fallback(v[0]), |
868 | 0 | f64_to_f16_fallback(v[1]), |
869 | 0 | f64_to_f16_fallback(v[2]), |
870 | 0 | f64_to_f16_fallback(v[3]), |
871 | 0 | f64_to_f16_fallback(v[4]), |
872 | 0 | f64_to_f16_fallback(v[5]), |
873 | 0 | f64_to_f16_fallback(v[6]), |
874 | 0 | f64_to_f16_fallback(v[7]), |
875 | 0 | ] |
876 | 0 | } |
877 | | |
878 | | #[inline] |
879 | 0 | fn slice_fallback<S: Copy, D>(src: &[S], dst: &mut [D], f: fn(S) -> D) { |
880 | 0 | assert_eq!(src.len(), dst.len()); |
881 | 0 | for (s, d) in src.iter().copied().zip(dst.iter_mut()) { |
882 | 0 | *d = f(s); |
883 | 0 | } |
884 | 0 | } |
885 | | |
886 | | #[inline] |
887 | 0 | fn add_f16_fallback(a: u16, b: u16) -> u16 { |
888 | 0 | f32_to_f16(f16_to_f32(a) + f16_to_f32(b)) |
889 | 0 | } |
890 | | |
891 | | #[inline] |
892 | 0 | fn subtract_f16_fallback(a: u16, b: u16) -> u16 { |
893 | 0 | f32_to_f16(f16_to_f32(a) - f16_to_f32(b)) |
894 | 0 | } |
895 | | |
896 | | #[inline] |
897 | 0 | fn multiply_f16_fallback(a: u16, b: u16) -> u16 { |
898 | 0 | f32_to_f16(f16_to_f32(a) * f16_to_f32(b)) |
899 | 0 | } |
900 | | |
901 | | #[inline] |
902 | 0 | fn divide_f16_fallback(a: u16, b: u16) -> u16 { |
903 | 0 | f32_to_f16(f16_to_f32(a) / f16_to_f32(b)) |
904 | 0 | } |
905 | | |
906 | | #[inline] |
907 | 0 | fn remainder_f16_fallback(a: u16, b: u16) -> u16 { |
908 | 0 | f32_to_f16(f16_to_f32(a) % f16_to_f32(b)) |
909 | 0 | } |
910 | | |
911 | | #[inline] |
912 | 0 | fn product_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 { |
913 | 0 | f32_to_f16(iter.map(f16_to_f32).product()) |
914 | 0 | } |
915 | | |
916 | | #[inline] |
917 | 0 | fn sum_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 { |
918 | 0 | f32_to_f16(iter.map(f16_to_f32).sum()) |
919 | 0 | } |
920 | | |
921 | | // TODO SIMD arithmetic |