Coverage Report

Created: 2025-10-14 06:57

/rust/registry/src/index.crates.io-1949cf8c6b5b557f/half-2.7.1/src/binary16/arch.rs
Line
Count
Source
1
#![allow(dead_code, unused_imports)]
2
use crate::leading_zeros::leading_zeros_u16;
3
use core::mem;
4
5
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6
mod x86;
7
8
#[cfg(target_arch = "aarch64")]
9
mod aarch64;
10
11
#[cfg(all(feature = "nightly", target_arch = "loongarch64"))]
12
mod loongarch64;
13
14
macro_rules! convert_fn {
15
    (if x86_feature("f16c") { $f16c:expr }
16
    else if aarch64_feature("fp16") { $aarch64:expr }
17
    else if loongarch64_feature("lsx") { $loongarch64:expr }
18
    else { $fallback:expr }) => {
19
        cfg_if::cfg_if! {
20
            // Use intrinsics directly when the target feature is enabled at compile time or when using no_std
21
            if #[cfg(all(
22
                any(target_arch = "x86", target_arch = "x86_64"),
23
                target_feature = "f16c"
24
            ))] {
25
                $f16c
26
            }
27
            else if #[cfg(all(
28
                target_arch = "aarch64",
29
                target_feature = "fp16"
30
            ))] {
31
                $aarch64
32
            }
33
            else if #[cfg(all(
34
                feature = "nightly",
35
                target_arch = "loongarch64",
36
                target_feature = "lsx"
37
            ))] {
38
                $loongarch64
39
            }
40
41
            // Use CPU feature detection if using std
42
            else if #[cfg(all(
43
                feature = "std",
44
                any(target_arch = "x86", target_arch = "x86_64")
45
            ))] {
46
                use std::arch::is_x86_feature_detected;
47
                if is_x86_feature_detected!("f16c") {
48
                    $f16c
49
                } else {
50
                    $fallback
51
                }
52
            }
53
            else if #[cfg(all(
54
                feature = "std",
55
                target_arch = "aarch64",
56
            ))] {
57
                use std::arch::is_aarch64_feature_detected;
58
                if is_aarch64_feature_detected!("fp16") {
59
                    $aarch64
60
                } else {
61
                    $fallback
62
                }
63
            }
64
            else if #[cfg(all(
65
                feature = "std",
66
                feature = "nightly",
67
                target_arch = "loongarch64",
68
            ))] {
69
                use std::arch::is_loongarch_feature_detected;
70
                if is_loongarch_feature_detected!("lsx") {
71
                    $loongarch64
72
                } else {
73
                    $fallback
74
                }
75
            }
76
77
            // Fallback to software
78
            else {
79
                $fallback
80
            }
81
        }
82
    };
83
}
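
The comments inside convert_fn! describe a three-tier dispatch: use the intrinsics unconditionally when the matching target feature is enabled at compile time, fall back to runtime CPU feature detection when std is available, and otherwise take the software path. As a rough illustration, the runtime-detection tier on x86/x86_64 selects code of roughly this shape; the function name and the conversion bodies below are stand-ins for the macro arguments ($f16c and $fallback), not the crate's actual implementations:

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn f16_to_f32_dispatch(bits: u16) -> f32 {
        // Stand-in for the hardware path ($f16c); the real crate calls the
        // F16C intrinsics from x86.rs here.
        unsafe fn hw(bits: u16) -> f32 {
            sw(bits)
        }
        // Stand-in for the software path ($fallback); the real crate calls
        // f16_to_f32_fallback here.
        fn sw(bits: u16) -> f32 {
            f32::from(bits)
        }

        use std::arch::is_x86_feature_detected;
        if is_x86_feature_detected!("f16c") {
            // The runtime check makes the hardware path sound to call.
            unsafe { hw(bits) }
        } else {
            sw(bits)
        }
    }
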
84
85
#[inline]
86
0
pub(crate) fn f32_to_f16(f: f32) -> u16 {
87
0
    convert_fn! {
88
        if x86_feature("f16c") {
89
0
            unsafe { x86::f32_to_f16_x86_f16c(f) }
90
        } else if aarch64_feature("fp16") {
91
            unsafe { aarch64::f32_to_f16_fp16(f) }
92
        } else if loongarch64_feature("lsx") {
93
            unsafe { loongarch64::f32_to_f16_lsx(f) }
94
        } else {
95
0
            f32_to_f16_fallback(f)
96
        }
97
    }
98
0
}
Unexecuted instantiation: half::binary16::arch::f32_to_f16 (×2)
99
100
#[inline]
101
0
pub(crate) fn f64_to_f16(f: f64) -> u16 {
102
0
    convert_fn! {
103
        if x86_feature("f16c") {
104
0
            unsafe { x86::f32_to_f16_x86_f16c(f as f32) }
105
        } else if aarch64_feature("fp16") {
106
            unsafe { aarch64::f64_to_f16_fp16(f) }
107
        } else if loongarch64_feature("lsx") {
108
            f64_to_f16_fallback(f)
109
        } else {
110
0
            f64_to_f16_fallback(f)
111
        }
112
    }
113
0
}
114
115
#[inline]
116
0
pub(crate) fn f16_to_f32(i: u16) -> f32 {
117
0
    convert_fn! {
118
        if x86_feature("f16c") {
119
0
            unsafe { x86::f16_to_f32_x86_f16c(i) }
120
        } else if aarch64_feature("fp16") {
121
            unsafe { aarch64::f16_to_f32_fp16(i) }
122
        } else if loongarch64_feature("lsx") {
123
            unsafe { loongarch64::f16_to_f32_lsx(i) }
124
        } else {
125
0
            f16_to_f32_fallback(i)
126
        }
127
    }
128
0
}
Unexecuted instantiation: half::binary16::arch::f16_to_f32 (×2)
129
130
#[inline]
131
0
pub(crate) fn f16_to_f64(i: u16) -> f64 {
132
0
    convert_fn! {
133
        if x86_feature("f16c") {
134
0
            unsafe { x86::f16_to_f32_x86_f16c(i) as f64 }
135
        } else if aarch64_feature("fp16") {
136
            unsafe { aarch64::f16_to_f64_fp16(i) }
137
        } else if loongarch64_feature("lsx") {
138
            unsafe { loongarch64::f16_to_f32_lsx(i) as f64 }
139
        } else {
140
0
            f16_to_f64_fallback(i)
141
        }
142
    }
143
0
}
144
145
#[inline]
146
0
pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] {
147
0
    convert_fn! {
148
        if x86_feature("f16c") {
149
0
            unsafe { x86::f32x4_to_f16x4_x86_f16c(f) }
150
        } else if aarch64_feature("fp16") {
151
            unsafe { aarch64::f32x4_to_f16x4_fp16(f) }
152
        } else if loongarch64_feature("lsx") {
153
            unsafe { loongarch64::f32x4_to_f16x4_lsx(f) }
154
        } else {
155
0
            f32x4_to_f16x4_fallback(f)
156
        }
157
    }
158
0
}
159
160
#[inline]
161
0
pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] {
162
0
    convert_fn! {
163
        if x86_feature("f16c") {
164
0
            unsafe { x86::f16x4_to_f32x4_x86_f16c(i) }
165
        } else if aarch64_feature("fp16") {
166
            unsafe { aarch64::f16x4_to_f32x4_fp16(i) }
167
        } else if loongarch64_feature("lsx") {
168
            unsafe { loongarch64::f16x4_to_f32x4_lsx(i) }
169
        } else {
170
0
            f16x4_to_f32x4_fallback(i)
171
        }
172
    }
173
0
}
174
175
#[inline]
176
0
pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] {
177
0
    convert_fn! {
178
        if x86_feature("f16c") {
179
0
            unsafe { x86::f64x4_to_f16x4_x86_f16c(f) }
180
        } else if aarch64_feature("fp16") {
181
            unsafe { aarch64::f64x4_to_f16x4_fp16(f) }
182
        } else if loongarch64_feature("lsx") {
183
            unsafe { loongarch64::f64x4_to_f16x4_lsx(f) }
184
        } else {
185
0
            f64x4_to_f16x4_fallback(f)
186
        }
187
    }
188
0
}
189
190
#[inline]
191
0
pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] {
192
0
    convert_fn! {
193
        if x86_feature("f16c") {
194
0
            unsafe { x86::f16x4_to_f64x4_x86_f16c(i) }
195
        } else if aarch64_feature("fp16") {
196
            unsafe { aarch64::f16x4_to_f64x4_fp16(i) }
197
        } else if loongarch64_feature("lsx") {
198
            unsafe { loongarch64::f16x4_to_f64x4_lsx(i) }
199
        } else {
200
0
            f16x4_to_f64x4_fallback(i)
201
        }
202
    }
203
0
}
204
205
#[inline]
206
0
pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] {
207
0
    convert_fn! {
208
        if x86_feature("f16c") {
209
0
            unsafe { x86::f32x8_to_f16x8_x86_f16c(f) }
210
        } else if aarch64_feature("fp16") {
211
            {
212
                let mut result = [0u16; 8];
213
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
214
                    aarch64::f32x4_to_f16x4_fp16);
215
                result
216
            }
217
        } else if loongarch64_feature("lsx") {
218
            {
219
                let mut result = [0u16; 8];
220
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
221
                    loongarch64::f32x4_to_f16x4_lsx);
222
                result
223
            }
224
        } else {
225
0
            f32x8_to_f16x8_fallback(f)
226
        }
227
    }
228
0
}
229
230
#[inline]
231
0
pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] {
232
0
    convert_fn! {
233
        if x86_feature("f16c") {
234
0
            unsafe { x86::f16x8_to_f32x8_x86_f16c(i) }
235
        } else if aarch64_feature("fp16") {
236
            {
237
                let mut result = [0f32; 8];
238
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
239
                    aarch64::f16x4_to_f32x4_fp16);
240
                result
241
            }
242
        } else if loongarch64_feature("lsx") {
243
            {
244
                let mut result = [0f32; 8];
245
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
246
                    loongarch64::f16x4_to_f32x4_lsx);
247
                result
248
            }
249
        } else {
250
0
            f16x8_to_f32x8_fallback(i)
251
        }
252
    }
253
0
}
254
255
#[inline]
256
0
pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] {
257
0
    convert_fn! {
258
        if x86_feature("f16c") {
259
0
            unsafe { x86::f64x8_to_f16x8_x86_f16c(f) }
260
        } else if aarch64_feature("fp16") {
261
            {
262
                let mut result = [0u16; 8];
263
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
264
                    aarch64::f64x4_to_f16x4_fp16);
265
                result
266
            }
267
        } else if loongarch64_feature("lsx") {
268
            {
269
                let mut result = [0u16; 8];
270
                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
271
                    loongarch64::f64x4_to_f16x4_lsx);
272
                result
273
            }
274
        } else {
275
0
            f64x8_to_f16x8_fallback(f)
276
        }
277
    }
278
0
}
279
280
#[inline]
281
0
pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] {
282
0
    convert_fn! {
283
        if x86_feature("f16c") {
284
0
            unsafe { x86::f16x8_to_f64x8_x86_f16c(i) }
285
        } else if aarch64_feature("fp16") {
286
            {
287
                let mut result = [0f64; 8];
288
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
289
                    aarch64::f16x4_to_f64x4_fp16);
290
                result
291
            }
292
        } else if loongarch64_feature("lsx") {
293
            {
294
                let mut result = [0f64; 8];
295
                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
296
                    loongarch64::f16x4_to_f64x4_lsx);
297
                result
298
            }
299
        } else {
300
0
            f16x8_to_f64x8_fallback(i)
301
        }
302
    }
303
0
}
304
305
#[inline]
306
0
pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) {
307
0
    convert_fn! {
308
        if x86_feature("f16c") {
309
0
            convert_chunked_slice_8(src, dst, x86::f32x8_to_f16x8_x86_f16c,
310
0
                x86::f32x4_to_f16x4_x86_f16c)
311
        } else if aarch64_feature("fp16") {
312
            convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16)
313
        } else if loongarch64_feature("lsx") {
314
            convert_chunked_slice_4(src, dst, loongarch64::f32x4_to_f16x4_lsx)
315
        } else {
316
0
            slice_fallback(src, dst, f32_to_f16_fallback)
317
        }
318
    }
319
0
}
320
321
#[inline]
322
211k
pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) {
323
211k
    convert_fn! {
324
        if x86_feature("f16c") {
325
211k
            convert_chunked_slice_8(src, dst, x86::f16x8_to_f32x8_x86_f16c,
326
211k
                x86::f16x4_to_f32x4_x86_f16c)
327
        } else if aarch64_feature("fp16") {
328
            convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16)
329
        } else if loongarch64_feature("lsx") {
330
            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f32x4_lsx)
331
        } else {
332
0
            slice_fallback(src, dst, f16_to_f32_fallback)
333
        }
334
    }
335
211k
}
Unexecuted instantiation: half::binary16::arch::f16_to_f32_slice (×12)
half::binary16::arch::f16_to_f32_slice: 211k executions (per-line counts identical to the listing above)
336
337
#[inline]
338
0
pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) {
339
0
    convert_fn! {
340
        if x86_feature("f16c") {
341
0
            convert_chunked_slice_8(src, dst, x86::f64x8_to_f16x8_x86_f16c,
342
0
                x86::f64x4_to_f16x4_x86_f16c)
343
        } else if aarch64_feature("fp16") {
344
            convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16)
345
        } else if loongarch64_feature("lsx") {
346
            convert_chunked_slice_4(src, dst, loongarch64::f64x4_to_f16x4_lsx)
347
        } else {
348
0
            slice_fallback(src, dst, f64_to_f16_fallback)
349
        }
350
    }
351
0
}
352
353
#[inline]
354
0
pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) {
355
0
    convert_fn! {
356
        if x86_feature("f16c") {
357
0
            convert_chunked_slice_8(src, dst, x86::f16x8_to_f64x8_x86_f16c,
358
0
                x86::f16x4_to_f64x4_x86_f16c)
359
        } else if aarch64_feature("fp16") {
360
            convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16)
361
        } else if loongarch64_feature("lsx") {
362
            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f64x4_lsx)
363
        } else {
364
0
            slice_fallback(src, dst, f16_to_f64_fallback)
365
        }
366
    }
367
0
}
368
369
macro_rules! math_fn {
370
    (if aarch64_feature("fp16") { $aarch64:expr }
371
    else { $fallback:expr }) => {
372
        cfg_if::cfg_if! {
373
            // Use intrinsics directly when the target feature is enabled at compile time or when using no_std
374
            if #[cfg(all(
375
                target_arch = "aarch64",
376
                target_feature = "fp16"
377
            ))] {
378
                $aarch64
379
            }
380
381
            // Use CPU feature detection if using std
382
            else if #[cfg(all(
383
                feature = "std",
384
                target_arch = "aarch64",
385
                not(target_feature = "fp16")
386
            ))] {
387
                use std::arch::is_aarch64_feature_detected;
388
                if is_aarch64_feature_detected!("fp16") {
389
                    $aarch64
390
                } else {
391
                    $fallback
392
                }
393
            }
394
395
            // Fallback to software
396
            else {
397
                $fallback
398
            }
399
        }
400
    };
401
}
402
403
#[inline]
404
0
pub(crate) fn add_f16(a: u16, b: u16) -> u16 {
405
    math_fn! {
406
        if aarch64_feature("fp16") {
407
            unsafe { aarch64::add_f16_fp16(a, b) }
408
        } else {
409
0
            add_f16_fallback(a, b)
410
        }
411
    }
412
0
}
413
414
#[inline]
415
0
pub(crate) fn subtract_f16(a: u16, b: u16) -> u16 {
416
    math_fn! {
417
        if aarch64_feature("fp16") {
418
            unsafe { aarch64::subtract_f16_fp16(a, b) }
419
        } else {
420
0
            subtract_f16_fallback(a, b)
421
        }
422
    }
423
0
}
424
425
#[inline]
426
0
pub(crate) fn multiply_f16(a: u16, b: u16) -> u16 {
427
    math_fn! {
428
        if aarch64_feature("fp16") {
429
            unsafe { aarch64::multiply_f16_fp16(a, b) }
430
        } else {
431
0
            multiply_f16_fallback(a, b)
432
        }
433
    }
434
0
}
435
436
#[inline]
437
0
pub(crate) fn divide_f16(a: u16, b: u16) -> u16 {
438
    math_fn! {
439
        if aarch64_feature("fp16") {
440
            unsafe { aarch64::divide_f16_fp16(a, b) }
441
        } else {
442
0
            divide_f16_fallback(a, b)
443
        }
444
    }
445
0
}
446
447
#[inline]
448
0
pub(crate) fn remainder_f16(a: u16, b: u16) -> u16 {
449
0
    remainder_f16_fallback(a, b)
450
0
}
451
452
#[inline]
453
0
pub(crate) fn product_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
454
    math_fn! {
455
        if aarch64_feature("fp16") {
456
            iter.fold(0, |acc, x| unsafe { aarch64::multiply_f16_fp16(acc, x) })
457
        } else {
458
0
            product_f16_fallback(iter)
459
        }
460
    }
461
0
}
462
463
#[inline]
464
0
pub(crate) fn sum_f16<I: Iterator<Item = u16>>(iter: I) -> u16 {
465
    math_fn! {
466
        if aarch64_feature("fp16") {
467
            iter.fold(0, |acc, x| unsafe { aarch64::add_f16_fp16(acc, x) })
468
        } else {
469
0
            sum_f16_fallback(iter)
470
        }
471
    }
472
0
}
473
474
/// Chunks sliced into x8 or x4 arrays
475
#[inline]
476
211k
fn convert_chunked_slice_8<S: Copy + Default, D: Copy>(
477
211k
    src: &[S],
478
211k
    dst: &mut [D],
479
211k
    fn8: unsafe fn(&[S; 8]) -> [D; 8],
480
211k
    fn4: unsafe fn(&[S; 4]) -> [D; 4],
481
211k
) {
482
211k
    assert_eq!(src.len(), dst.len());
483
484
    // TODO: Can be further optimized with array_chunks when it becomes stabilized
485
486
211k
    let src_chunks = src.chunks_exact(8);
487
211k
    let mut dst_chunks = dst.chunks_exact_mut(8);
488
211k
    let src_remainder = src_chunks.remainder();
489
340k
    for (s, d) in src_chunks.zip(&mut dst_chunks) {
490
340k
        let chunk: &[S; 8] = s.try_into().unwrap();
491
340k
        d.copy_from_slice(unsafe { &fn8(chunk) });
492
340k
    }
493
494
    // Process remainder
495
211k
    if src_remainder.len() > 4 {
496
21
        let mut buf: [S; 8] = Default::default();
497
21
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
498
21
        let vec = unsafe { fn8(&buf) };
499
21
        let dst_remainder = dst_chunks.into_remainder();
500
21
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
501
211k
    } else if !src_remainder.is_empty() {
502
40.7k
        let mut buf: [S; 4] = Default::default();
503
40.7k
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
504
40.7k
        let vec = unsafe { fn4(&buf) };
505
40.7k
        let dst_remainder = dst_chunks.into_remainder();
506
40.7k
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
507
170k
    }
508
211k
}
Unexecuted instantiation: half::binary16::arch::convert_chunked_slice_8::<u16, f32> (×11)
Unexecuted instantiation: half::binary16::arch::convert_chunked_slice_8::<f64, u16>
Unexecuted instantiation: half::binary16::arch::convert_chunked_slice_8::<f32, u16>
half::binary16::arch::convert_chunked_slice_8::<u16, f32>: 211k executions (per-line counts identical to the listing above)
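
To make the remainder handling in convert_chunked_slice_8 concrete: for a 13-element slice, one full chunk of 8 goes through fn8, and the 5 leftover elements (more than 4) are copied into a zero-padded 8-wide buffer so fn8 can be reused; for an 11-element slice the 3 leftovers fit a zero-padded 4-wide buffer and go through fn4 instead. A minimal, self-contained sketch of the same padding idea (names are illustrative, not from the crate):

    // Convert a remainder of at most 4 elements by padding it into a fixed
    // 4-wide buffer, so a vector-width conversion function can still be used.
    fn convert_remainder_4(rem: &[u16], out: &mut [f32], f4: fn(&[u16; 4]) -> [f32; 4]) {
        assert!(rem.len() <= 4 && rem.len() == out.len());
        let mut buf = [0u16; 4];
        buf[..rem.len()].copy_from_slice(rem);
        let converted = f4(&buf);
        out.copy_from_slice(&converted[..out.len()]);
    }
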
509
510
/// Chunks sliced into x4 arrays
511
#[inline]
512
0
fn convert_chunked_slice_4<S: Copy + Default, D: Copy>(
513
0
    src: &[S],
514
0
    dst: &mut [D],
515
0
    f: unsafe fn(&[S; 4]) -> [D; 4],
516
0
) {
517
0
    assert_eq!(src.len(), dst.len());
518
519
    // TODO: Can be further optimized with array_chunks when it becomes stabilized
520
521
0
    let src_chunks = src.chunks_exact(4);
522
0
    let mut dst_chunks = dst.chunks_exact_mut(4);
523
0
    let src_remainder = src_chunks.remainder();
524
0
    for (s, d) in src_chunks.zip(&mut dst_chunks) {
525
0
        let chunk: &[S; 4] = s.try_into().unwrap();
526
0
        d.copy_from_slice(unsafe { &f(chunk) });
527
0
    }
528
529
    // Process remainder
530
0
    if !src_remainder.is_empty() {
531
0
        let mut buf: [S; 4] = Default::default();
532
0
        buf[..src_remainder.len()].copy_from_slice(src_remainder);
533
0
        let vec = unsafe { f(&buf) };
534
0
        let dst_remainder = dst_chunks.into_remainder();
535
0
        dst_remainder.copy_from_slice(&vec[..dst_remainder.len()]);
536
0
    }
537
0
}
538
539
/////////////// Fallbacks ////////////////
540
541
// In the below functions, round to nearest, with ties to even.
542
// Let us call the most significant bit that will be shifted out the round_bit.
543
//
544
// Round up if either
545
//  a) Removed part > tie.
546
//     (mantissa & round_bit) != 0 && (mantissa & (round_bit - 1)) != 0
547
//  b) Removed part == tie, and retained part is odd.
548
//     (mantissa & round_bit) != 0 && (mantissa & (2 * round_bit)) != 0
549
// (If removed part == tie and retained part is even, do not round up.)
550
// These two conditions can be combined into one:
551
//     (mantissa & round_bit) != 0 && (mantissa & ((round_bit - 1) | (2 * round_bit))) != 0
552
// which can be simplified into
553
//     (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0
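
As a concrete check of the combined condition, the sketch below (not part of the crate) drops the low 13 mantissa bits with round_bit = 1 << 12, which is the situation in f32_to_f16_fallback for normal numbers:

    // Illustration only: shift out 13 mantissa bits with round-to-nearest,
    // ties-to-even, using the combined test derived above.
    fn round_shift_13(mantissa: u32) -> u32 {
        let round_bit = 1u32 << 12;
        let mut kept = mantissa >> 13;
        if (mantissa & round_bit) != 0 && (mantissa & (3 * round_bit - 1)) != 0 {
            kept += 1; // removed part exceeds the tie, or ties with an odd kept part
        }
        kept
    }

    // Example: 0x3000 has kept part 1 (odd) with the removed part exactly at the
    // tie, so it rounds up to 2; 0x5000 has kept part 2 (even) at the tie and stays 2.
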
554
555
#[inline]
556
0
pub(crate) const fn f32_to_f16_fallback(value: f32) -> u16 {
557
    // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
558
    // Convert to raw bytes
559
0
    let x: u32 = unsafe { mem::transmute::<f32, u32>(value) };
560
561
    // Extract IEEE754 components
562
0
    let sign = x & 0x8000_0000u32;
563
0
    let exp = x & 0x7F80_0000u32;
564
0
    let man = x & 0x007F_FFFFu32;
565
566
    // Check for all exponent bits being set, which is Infinity or NaN
567
0
    if exp == 0x7F80_0000u32 {
568
        // Set mantissa MSB for NaN (and also keep shifted mantissa bits)
569
0
        let nan_bit = if man == 0 { 0 } else { 0x0200u32 };
570
0
        return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 13)) as u16;
571
0
    }
572
573
    // The number is normalized, start assembling half precision version
574
0
    let half_sign = sign >> 16;
575
    // Unbias the exponent, then bias for half precision
576
0
    let unbiased_exp = ((exp >> 23) as i32) - 127;
577
0
    let half_exp = unbiased_exp + 15;
578
579
    // Check for exponent overflow, return +infinity
580
0
    if half_exp >= 0x1F {
581
0
        return (half_sign | 0x7C00u32) as u16;
582
0
    }
583
584
    // Check for underflow
585
0
    if half_exp <= 0 {
586
        // Check mantissa for what we can do
587
0
        if 14 - half_exp > 24 {
588
            // No rounding possibility, so this is a full underflow, return signed zero
589
0
            return half_sign as u16;
590
0
        }
591
        // Don't forget about hidden leading mantissa bit when assembling mantissa
592
0
        let man = man | 0x0080_0000u32;
593
0
        let mut half_man = man >> (14 - half_exp);
594
        // Check for rounding (see comment above functions)
595
0
        let round_bit = 1 << (13 - half_exp);
596
0
        if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
597
0
            half_man += 1;
598
0
        }
599
        // No exponent for subnormals
600
0
        return (half_sign | half_man) as u16;
601
0
    }
602
603
    // Rebias the exponent
604
0
    let half_exp = (half_exp as u32) << 10;
605
0
    let half_man = man >> 13;
606
    // Check for rounding (see comment above functions)
607
0
    let round_bit = 0x0000_1000u32;
608
0
    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
609
        // Round it
610
0
        ((half_sign | half_exp | half_man) + 1) as u16
611
    } else {
612
0
        (half_sign | half_exp | half_man) as u16
613
    }
614
0
}
Unexecuted instantiation: half::binary16::arch::f32_to_f16_fallback (×2)
615
616
#[inline]
617
0
pub(crate) const fn f64_to_f16_fallback(value: f64) -> u16 {
618
    // Convert to raw bytes, truncating the last 32 bits of mantissa; that precision will always
619
    // be lost on half-precision.
620
    // TODO: Replace mem::transmute with to_bits() once to_bits is const-stabilized
621
0
    let val: u64 = unsafe { mem::transmute::<f64, u64>(value) };
622
0
    let x = (val >> 32) as u32;
623
624
    // Extract IEEE754 components
625
0
    let sign = x & 0x8000_0000u32;
626
0
    let exp = x & 0x7FF0_0000u32;
627
0
    let man = x & 0x000F_FFFFu32;
628
629
    // Check for all exponent bits being set, which is Infinity or NaN
630
0
    if exp == 0x7FF0_0000u32 {
631
        // Set mantissa MSB for NaN (and also keep shifted mantissa bits).
632
        // We also have to check the last 32 bits.
633
0
        let nan_bit = if man == 0 && (val as u32 == 0) {
634
0
            0
635
        } else {
636
0
            0x0200u32
637
        };
638
0
        return ((sign >> 16) | 0x7C00u32 | nan_bit | (man >> 10)) as u16;
639
0
    }
640
641
    // The number is normalized, start assembling half precision version
642
0
    let half_sign = sign >> 16;
643
    // Unbias the exponent, then bias for half precision
644
0
    let unbiased_exp = ((exp >> 20) as i64) - 1023;
645
0
    let half_exp = unbiased_exp + 15;
646
647
    // Check for exponent overflow, return +infinity
648
0
    if half_exp >= 0x1F {
649
0
        return (half_sign | 0x7C00u32) as u16;
650
0
    }
651
652
    // Check for underflow
653
0
    if half_exp <= 0 {
654
        // Check mantissa for what we can do
655
0
        if 10 - half_exp > 21 {
656
            // No rounding possibility, so this is a full underflow, return signed zero
657
0
            return half_sign as u16;
658
0
        }
659
        // Don't forget about hidden leading mantissa bit when assembling mantissa
660
0
        let man = man | 0x0010_0000u32;
661
0
        let mut half_man = man >> (11 - half_exp);
662
        // Check for rounding (see comment above functions)
663
0
        let round_bit = 1 << (10 - half_exp);
664
0
        if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
665
0
            half_man += 1;
666
0
        }
667
        // No exponent for subnormals
668
0
        return (half_sign | half_man) as u16;
669
0
    }
670
671
    // Rebias the exponent
672
0
    let half_exp = (half_exp as u32) << 10;
673
0
    let half_man = man >> 10;
674
    // Check for rounding (see comment above functions)
675
0
    let round_bit = 0x0000_0200u32;
676
0
    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
677
        // Round it
678
0
        ((half_sign | half_exp | half_man) + 1) as u16
679
    } else {
680
0
        (half_sign | half_exp | half_man) as u16
681
    }
682
0
}
683
684
#[inline]
685
0
pub(crate) const fn f16_to_f32_fallback(i: u16) -> f32 {
686
    // Check for signed zero
687
    // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
688
0
    if i & 0x7FFFu16 == 0 {
689
0
        return unsafe { mem::transmute::<u32, f32>((i as u32) << 16) };
690
0
    }
691
692
0
    let half_sign = (i & 0x8000u16) as u32;
693
0
    let half_exp = (i & 0x7C00u16) as u32;
694
0
    let half_man = (i & 0x03FFu16) as u32;
695
696
    // Check for an infinity or NaN when all exponent bits set
697
0
    if half_exp == 0x7C00u32 {
698
        // Check for signed infinity if mantissa is zero
699
0
        if half_man == 0 {
700
0
            return unsafe { mem::transmute::<u32, f32>((half_sign << 16) | 0x7F80_0000u32) };
701
        } else {
702
            // NaN, keep current mantissa but also set most significant mantissa bit
703
            return unsafe {
704
0
                mem::transmute::<u32, f32>((half_sign << 16) | 0x7FC0_0000u32 | (half_man << 13))
705
            };
706
        }
707
0
    }
708
709
    // Calculate single-precision components with adjusted exponent
710
0
    let sign = half_sign << 16;
711
    // Unbias exponent
712
0
    let unbiased_exp = ((half_exp as i32) >> 10) - 15;
713
714
    // Check for subnormals, which will be normalized by adjusting exponent
715
0
    if half_exp == 0 {
716
        // Calculate how much to adjust the exponent by
717
0
        let e = leading_zeros_u16(half_man as u16) - 6;
718
719
        // Rebias and adjust exponent
720
0
        let exp = (127 - 15 - e) << 23;
721
0
        let man = (half_man << (14 + e)) & 0x7F_FF_FFu32;
722
0
        return unsafe { mem::transmute::<u32, f32>(sign | exp | man) };
723
0
    }
724
725
    // Rebias exponent for a normalized normal
726
0
    let exp = ((unbiased_exp + 127) as u32) << 23;
727
0
    let man = (half_man & 0x03FFu32) << 13;
728
0
    unsafe { mem::transmute::<u32, f32>(sign | exp | man) }
729
0
}
Unexecuted instantiation: half::binary16::arch::f16_to_f32_fallback (×14)
730
731
#[inline]
732
0
pub(crate) const fn f16_to_f64_fallback(i: u16) -> f64 {
733
    // Check for signed zero
734
    // TODO: Replace mem::transmute with from_bits() once from_bits is const-stabilized
735
0
    if i & 0x7FFFu16 == 0 {
736
0
        return unsafe { mem::transmute::<u64, f64>((i as u64) << 48) };
737
0
    }
738
739
0
    let half_sign = (i & 0x8000u16) as u64;
740
0
    let half_exp = (i & 0x7C00u16) as u64;
741
0
    let half_man = (i & 0x03FFu16) as u64;
742
743
    // Check for an infinity or NaN when all exponent bits set
744
0
    if half_exp == 0x7C00u64 {
745
        // Check for signed infinity if mantissa is zero
746
0
        if half_man == 0 {
747
            return unsafe {
748
0
                mem::transmute::<u64, f64>((half_sign << 48) | 0x7FF0_0000_0000_0000u64)
749
            };
750
        } else {
751
            // NaN, keep current mantissa but also set most significant mantissa bit
752
            return unsafe {
753
0
                mem::transmute::<u64, f64>(
754
0
                    (half_sign << 48) | 0x7FF8_0000_0000_0000u64 | (half_man << 42),
755
                )
756
            };
757
        }
758
0
    }
759
760
    // Calculate double-precision components with adjusted exponent
761
0
    let sign = half_sign << 48;
762
    // Unbias exponent
763
0
    let unbiased_exp = ((half_exp as i64) >> 10) - 15;
764
765
    // Check for subnormals, which will be normalized by adjusting exponent
766
0
    if half_exp == 0 {
767
        // Calculate how much to adjust the exponent by
768
0
        let e = leading_zeros_u16(half_man as u16) - 6;
769
770
        // Rebias and adjust exponent
771
0
        let exp = ((1023 - 15 - e) as u64) << 52;
772
0
        let man = (half_man << (43 + e)) & 0xF_FFFF_FFFF_FFFFu64;
773
0
        return unsafe { mem::transmute::<u64, f64>(sign | exp | man) };
774
0
    }
775
776
    // Rebias exponent for a normalized normal
777
0
    let exp = ((unbiased_exp + 1023) as u64) << 52;
778
0
    let man = (half_man & 0x03FFu64) << 42;
779
0
    unsafe { mem::transmute::<u64, f64>(sign | exp | man) }
780
0
}
781
782
#[inline]
783
0
fn f16x4_to_f32x4_fallback(v: &[u16; 4]) -> [f32; 4] {
784
0
    [
785
0
        f16_to_f32_fallback(v[0]),
786
0
        f16_to_f32_fallback(v[1]),
787
0
        f16_to_f32_fallback(v[2]),
788
0
        f16_to_f32_fallback(v[3]),
789
0
    ]
790
0
}
791
792
#[inline]
793
0
fn f32x4_to_f16x4_fallback(v: &[f32; 4]) -> [u16; 4] {
794
0
    [
795
0
        f32_to_f16_fallback(v[0]),
796
0
        f32_to_f16_fallback(v[1]),
797
0
        f32_to_f16_fallback(v[2]),
798
0
        f32_to_f16_fallback(v[3]),
799
0
    ]
800
0
}
801
802
#[inline]
803
0
fn f16x4_to_f64x4_fallback(v: &[u16; 4]) -> [f64; 4] {
804
0
    [
805
0
        f16_to_f64_fallback(v[0]),
806
0
        f16_to_f64_fallback(v[1]),
807
0
        f16_to_f64_fallback(v[2]),
808
0
        f16_to_f64_fallback(v[3]),
809
0
    ]
810
0
}
811
812
#[inline]
813
0
fn f64x4_to_f16x4_fallback(v: &[f64; 4]) -> [u16; 4] {
814
0
    [
815
0
        f64_to_f16_fallback(v[0]),
816
0
        f64_to_f16_fallback(v[1]),
817
0
        f64_to_f16_fallback(v[2]),
818
0
        f64_to_f16_fallback(v[3]),
819
0
    ]
820
0
}
821
822
#[inline]
823
0
fn f16x8_to_f32x8_fallback(v: &[u16; 8]) -> [f32; 8] {
824
0
    [
825
0
        f16_to_f32_fallback(v[0]),
826
0
        f16_to_f32_fallback(v[1]),
827
0
        f16_to_f32_fallback(v[2]),
828
0
        f16_to_f32_fallback(v[3]),
829
0
        f16_to_f32_fallback(v[4]),
830
0
        f16_to_f32_fallback(v[5]),
831
0
        f16_to_f32_fallback(v[6]),
832
0
        f16_to_f32_fallback(v[7]),
833
0
    ]
834
0
}
835
836
#[inline]
837
0
fn f32x8_to_f16x8_fallback(v: &[f32; 8]) -> [u16; 8] {
838
0
    [
839
0
        f32_to_f16_fallback(v[0]),
840
0
        f32_to_f16_fallback(v[1]),
841
0
        f32_to_f16_fallback(v[2]),
842
0
        f32_to_f16_fallback(v[3]),
843
0
        f32_to_f16_fallback(v[4]),
844
0
        f32_to_f16_fallback(v[5]),
845
0
        f32_to_f16_fallback(v[6]),
846
0
        f32_to_f16_fallback(v[7]),
847
0
    ]
848
0
}
849
850
#[inline]
851
0
fn f16x8_to_f64x8_fallback(v: &[u16; 8]) -> [f64; 8] {
852
0
    [
853
0
        f16_to_f64_fallback(v[0]),
854
0
        f16_to_f64_fallback(v[1]),
855
0
        f16_to_f64_fallback(v[2]),
856
0
        f16_to_f64_fallback(v[3]),
857
0
        f16_to_f64_fallback(v[4]),
858
0
        f16_to_f64_fallback(v[5]),
859
0
        f16_to_f64_fallback(v[6]),
860
0
        f16_to_f64_fallback(v[7]),
861
0
    ]
862
0
}
863
864
#[inline]
865
0
fn f64x8_to_f16x8_fallback(v: &[f64; 8]) -> [u16; 8] {
866
0
    [
867
0
        f64_to_f16_fallback(v[0]),
868
0
        f64_to_f16_fallback(v[1]),
869
0
        f64_to_f16_fallback(v[2]),
870
0
        f64_to_f16_fallback(v[3]),
871
0
        f64_to_f16_fallback(v[4]),
872
0
        f64_to_f16_fallback(v[5]),
873
0
        f64_to_f16_fallback(v[6]),
874
0
        f64_to_f16_fallback(v[7]),
875
0
    ]
876
0
}
877
878
#[inline]
879
0
fn slice_fallback<S: Copy, D>(src: &[S], dst: &mut [D], f: fn(S) -> D) {
880
0
    assert_eq!(src.len(), dst.len());
881
0
    for (s, d) in src.iter().copied().zip(dst.iter_mut()) {
882
0
        *d = f(s);
883
0
    }
884
0
}
Unexecuted instantiation: half::binary16::arch::slice_fallback::<u16, f32> (×12)
Unexecuted instantiation: half::binary16::arch::slice_fallback::<f64, u16>
Unexecuted instantiation: half::binary16::arch::slice_fallback::<f32, u16>
885
886
#[inline]
887
0
fn add_f16_fallback(a: u16, b: u16) -> u16 {
888
0
    f32_to_f16(f16_to_f32(a) + f16_to_f32(b))
889
0
}
890
891
#[inline]
892
0
fn subtract_f16_fallback(a: u16, b: u16) -> u16 {
893
0
    f32_to_f16(f16_to_f32(a) - f16_to_f32(b))
894
0
}
895
896
#[inline]
897
0
fn multiply_f16_fallback(a: u16, b: u16) -> u16 {
898
0
    f32_to_f16(f16_to_f32(a) * f16_to_f32(b))
899
0
}
900
901
#[inline]
902
0
fn divide_f16_fallback(a: u16, b: u16) -> u16 {
903
0
    f32_to_f16(f16_to_f32(a) / f16_to_f32(b))
904
0
}
905
906
#[inline]
907
0
fn remainder_f16_fallback(a: u16, b: u16) -> u16 {
908
0
    f32_to_f16(f16_to_f32(a) % f16_to_f32(b))
909
0
}
910
911
#[inline]
912
0
fn product_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
913
0
    f32_to_f16(iter.map(f16_to_f32).product())
914
0
}
915
916
#[inline]
917
0
fn sum_f16_fallback<I: Iterator<Item = u16>>(iter: I) -> u16 {
918
0
    f32_to_f16(iter.map(f16_to_f32).sum())
919
0
}
920
921
// TODO SIMD arithmetic