/rust/registry/src/github.com-1ecc6299db9ec823/memchr-2.2.1/src/x86/mod.rs
Line | Count | Source |
1 | | use fallback; |
2 | | |
3 | | // We only use AVX when we can detect at runtime whether it's available, which |
4 | | // requires std. |
5 | | #[cfg(feature = "use_std")] |
6 | | mod avx; |
7 | | mod sse2; |
8 | | |
9 | | // This macro employs a gcc-like "ifunc" trick whereby, upon first calling |
10 | | // `memchr` (for example), CPU feature detection will be performed at runtime |
11 | | // to determine the best implementation to use. After CPU feature detection |
12 | | // is done, we replace `memchr`'s function pointer with the selection. Upon |
13 | | // subsequent invocations, the CPU-specific routine is invoked directly, which |
14 | | // skips the CPU feature detection and the branch it would otherwise require. |
15 | | // |
16 | | // While this typically doesn't matter for rare occurrences or when used on |
17 | | // larger haystacks, `memchr` can be called in tight loops where the overhead |
18 | | // of this branch can actually add up *and is measurable*. This trick was |
19 | | // necessary to bring this implementation up to glibc's speeds for the 'tiny' |
20 | | // benchmarks, for example. |
21 | | // |
22 | | // At some point, I expect the Rust ecosystem will get a nice macro for doing |
23 | | // exactly this, at which point, we can replace our hand-jammed version of it. |
24 | | // |
25 | | // N.B. The ifunc strategy does prevent function inlining of course, but on |
26 | | // modern CPUs, you'll probably end up with the AVX2 implementation, which |
27 | | // probably can't be inlined anyway---unless you've compiled your entire |
28 | | // program with AVX2 enabled. However, even then, the various memchr |
29 | | // implementations aren't exactly small, so inlining might not help anyway! |
30 | | #[cfg(feature = "use_std")] |
31 | | macro_rules! ifunc { |
32 | | ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ |
33 | | use std::mem; |
34 | | use std::sync::atomic::{AtomicPtr, Ordering}; |
35 | | |
36 | | type FnRaw = *mut (); |
37 | | |
38 | | static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); |
39 | | |
40 | 8 | fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> { |
41 | 8 | let fun = |
42 | 0 | if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { |
43 | 8 | avx::$name as FnRaw |
44 | 0 | } else if cfg!(memchr_runtime_sse2) { |
45 | 0 | sse2::$name as FnRaw |
46 | | } else { |
47 | 0 | fallback::$name as FnRaw |
48 | | }; |
49 | 8 | FN.store(fun as FnRaw, Ordering::Relaxed); |
50 | 8 | unsafe { |
51 | 8 | mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack) |
52 | 8 | } |
53 | 8 | } |
Unexecuted instantiation: memchr::x86::memchr2::detect
Unexecuted instantiation: memchr::x86::memchr3::detect
Unexecuted instantiation: memchr::x86::memrchr::detect
Unexecuted instantiation: memchr::x86::memrchr2::detect
Unexecuted instantiation: memchr::x86::memrchr3::detect
Executed instantiation: memchr::x86::memchr::detect (per-line counts identical to the listing above)
54 | | |
55 | | unsafe { |
56 | | let fun = FN.load(Ordering::Relaxed); |
57 | | mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack) |
58 | | } |
59 | | }} |
60 | | } |
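
A minimal standalone sketch of the ifunc trick the comment above describes, with the macro expanded by hand for a single memchr-shaped function. The names `memchr_ifunc` and `memchr_fallback_impl` are illustrative, not part of this crate, and the naive byte scan stands in for the real avx/sse2/fallback selection:

    use std::mem;
    use std::sync::atomic::{AtomicPtr, Ordering};

    type MemchrFn = fn(u8, &[u8]) -> Option<usize>;
    type FnRaw = *mut ();

    // Starts out pointing at `detect`, so only the very first call pays for
    // CPU feature detection; every later call jumps straight to the routine
    // that detection selected.
    static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

    // Illustrative stand-in for the avx/sse2/fallback routines.
    fn memchr_fallback_impl(n1: u8, haystack: &[u8]) -> Option<usize> {
        haystack.iter().position(|&b| b == n1)
    }

    fn detect(n1: u8, haystack: &[u8]) -> Option<usize> {
        // The real macro picks avx::memchr or sse2::memchr here based on
        // is_x86_feature_detected!; the fallback stands in for all three.
        let fun = memchr_fallback_impl as FnRaw;
        FN.store(fun, Ordering::Relaxed);
        // Sound only because `fun` is always a function of this signature.
        unsafe { mem::transmute::<FnRaw, MemchrFn>(fun)(n1, haystack) }
    }

    pub fn memchr_ifunc(n1: u8, haystack: &[u8]) -> Option<usize> {
        let fun = FN.load(Ordering::Relaxed);
        unsafe { mem::transmute::<FnRaw, MemchrFn>(fun)(n1, haystack) }
    }

`Ordering::Relaxed` suffices because every value ever stored in `FN` is a valid function pointer with the right signature; the worst a racing thread can do is run detection one extra time.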
61 | | |
62 | | // When std isn't available to provide runtime CPU feature detection, or if |
63 | | // runtime CPU feature detection has been explicitly disabled, then just call |
64 | | // our optimized SSE2 routine directly. SSE2 is available on all x86_64 targets, |
65 | | // so no CPU feature detection is necessary. |
66 | | #[cfg(not(feature = "use_std"))] |
67 | | macro_rules! ifunc { |
68 | | ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ |
69 | | if cfg!(memchr_runtime_sse2) { |
70 | | unsafe { sse2::$name($($needle),+, $haystack) } |
71 | | } else { |
72 | | fallback::$name($($needle),+, $haystack) |
73 | | } |
74 | | }} |
75 | | } |
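
For concreteness, by direct substitution the call `ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1)` in `memchr` below expands, under this no-std arm, to a plain compile-time branch with no function pointer or atomic involved:

    if cfg!(memchr_runtime_sse2) {
        unsafe { sse2::memchr(n1, haystack) }
    } else {
        fallback::memchr(n1, haystack)
    }

The `unsafe` is confined to the sse2 call because that routine is compiled with an explicit target feature; as the comment above notes, SSE2 is baseline on x86_64, so the call is sound there without runtime detection.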
76 | | |
77 | | #[inline(always)] |
78 | 18.5M | pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { |
79 | 18.5M | ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1) |
80 | 18.5M | } |
81 | | |
82 | | #[inline(always)] |
83 | 0 | pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { |
84 | 0 | ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memchr2, haystack, n1, n2) |
85 | 0 | } |
86 | | |
87 | | #[inline(always)] |
88 | 0 | pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { |
89 | 0 | ifunc!(fn(u8, u8, u8, &[u8]) -> Option<usize>, memchr3, haystack, n1, n2, n3) |
90 | 0 | } |
91 | | |
92 | | #[inline(always)] |
93 | 0 | pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { |
94 | 0 | ifunc!(fn(u8, &[u8]) -> Option<usize>, memrchr, haystack, n1) |
95 | 0 | } |
96 | | |
97 | | #[inline(always)] |
98 | 0 | pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { |
99 | 0 | ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memrchr2, haystack, n1, n2) |
100 | 0 | } |
101 | | |
102 | | #[inline(always)] |
103 | 0 | pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { |
104 | 0 | ifunc!(fn(u8, u8, u8, &[u8]) -> Option<usize>, memrchr3, haystack, n1, n2, n3) |
105 | 0 | } |
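
A small caller-side sketch of the public API exercised by the counts above; in the published crate these routines are re-exported at the crate root (`memchr::memchr` and friends):

    fn main() {
        let haystack = b"abcdef";
        assert_eq!(memchr::memchr(b'd', haystack), Some(3));        // first 'd'
        assert_eq!(memchr::memrchr(b'a', haystack), Some(0));       // last 'a'
        assert_eq!(memchr::memchr2(b'z', b'e', haystack), Some(4)); // first of 'z' or 'e'
        assert_eq!(memchr::memchr(b'q', haystack), None);           // byte not present
    }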