/rust/registry/src/index.crates.io-6f17d22bba15001f/matrixmultiply-0.3.9/src/threading.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /// |
2 | | /// Threading support functions and statics |
3 | | |
4 | | #[cfg(feature="threading")] |
5 | | use std::cmp::min; |
6 | | #[cfg(feature="threading")] |
7 | | use std::str::FromStr; |
8 | | #[cfg(feature="threading")] |
9 | | use once_cell::sync::Lazy; |
10 | | |
11 | | #[cfg(feature="threading")] |
12 | | pub use thread_tree::ThreadTree as ThreadPool; |
13 | | #[cfg(feature="threading")] |
14 | | pub use thread_tree::ThreadTreeCtx as ThreadPoolCtx; |
15 | | |
16 | | use crate::kernel::GemmKernel; |
17 | | use crate::util::RangeChunk; |
18 | | |
/// Stand-in thread pool used when the `threading` feature is disabled.
#[cfg(not(feature="threading"))]
pub(crate) struct ThreadPool;

/// Context type of the stand-in pool: a plain reference to the unit value.
#[cfg(not(feature="threading"))]
pub(crate) type ThreadPoolCtx<'a> = &'a ();

#[cfg(not(feature="threading"))]
impl ThreadPool {
    /// Return the top-level context of the stand-in pool.
    ///
    /// The context carries no state; it is simply a reference to `()`.
    pub(crate) fn top(&self) -> ThreadPoolCtx<'_> {
        &()
    }
}
31 | | |
32 | 0 | pub(crate) fn get_thread_pool<'a>() -> (usize, ThreadPoolCtx<'a>) { |
33 | 0 | let reg = &*REGISTRY; |
34 | 0 | (reg.nthreads, reg.thread_pool().top()) |
35 | 0 | } |
36 | | |
37 | | struct Registry { |
38 | | nthreads: usize, |
39 | | #[cfg(feature="threading")] |
40 | | thread_pool: Box<ThreadPool>, |
41 | | } |
42 | | |
43 | | impl Registry { |
44 | 0 | fn thread_pool(&self) -> &ThreadPool { |
45 | 0 | #[cfg(feature="threading")] |
46 | 0 | return &*REGISTRY.thread_pool; |
47 | 0 | #[cfg(not(feature="threading"))] |
48 | 0 | return &ThreadPool; |
49 | 0 | } |
50 | | } |
51 | | |
52 | | #[cfg(not(feature="threading"))] |
53 | | const REGISTRY: &'static Registry = &Registry { nthreads: 1 }; |
54 | | |
/// Maximum (usefully) supported threads at the moment
#[cfg(feature="threading")]
const MAX_THREADS: usize = 4;

/// Lazily initialised global registry.
///
/// The thread count is read from the `MATMUL_NUM_THREADS` environment variable. If the
/// variable is unset or empty, the number of physical cores is used; if it cannot be parsed,
/// a message is printed to stderr and one thread is used. The result is limited to the range
/// `1..=MAX_THREADS`.
#[cfg(feature="threading")]
static REGISTRY: Lazy<Registry> = Lazy::new(|| {
    let requested = match ::std::env::var("MATMUL_NUM_THREADS") {
        Ok(ref value) if !value.is_empty() => match usize::from_str(value) {
            Ok(count) => count,
            Err(_) => {
                eprintln!("Failed to parse MATMUL_NUM_THREADS");
                1
            }
        },
        _ => num_cpus::get_physical(),
    };

    // Keep the thread count within 1 <= nthreads <= MAX_THREADS
    let nthreads = min(requested.max(1), MAX_THREADS);

    // Pick the pool level from the thread count: level 0 for one thread,
    // level 1 for two or three, level 2 otherwise.
    let thread_pool = match nthreads {
        0 | 1 => Box::new(ThreadPool::new_level0()),
        2 | 3 => ThreadPool::new_with_level(1),
        _ => ThreadPool::new_with_level(2),
    };

    Registry { nthreads, thread_pool }
});
90 | | |
91 | | /// Describe how many threads we use in each loop |
92 | | #[derive(Copy, Clone)] |
93 | | pub(crate) struct LoopThreadConfig { |
94 | | /// Loop 3 threads |
95 | | pub(crate) loop3: u8, |
96 | | /// Loop 2 threads |
97 | | pub(crate) loop2: u8, |
98 | | } |
99 | | |
100 | | impl LoopThreadConfig { |
101 | | /// Decide how many threads to use in each loop |
102 | 0 | pub(crate) fn new<K>(m: usize, k: usize, n: usize, max_threads: usize) -> Self |
103 | 0 | where K: GemmKernel |
104 | 0 | { |
105 | 0 | let default_config = LoopThreadConfig { loop3: 1, loop2: 1 }; |
106 | 0 |
|
107 | 0 | #[cfg(not(feature="threading"))] |
108 | 0 | { |
109 | 0 | let _ = (m, k, n, max_threads); // used |
110 | 0 | return default_config; |
111 | 0 | } |
112 | 0 |
|
113 | 0 | #[cfg(feature="threading")] |
114 | 0 | { |
115 | 0 | if max_threads == 1 { |
116 | 0 | return default_config; |
117 | 0 | } |
118 | 0 |
|
119 | 0 | Self::new_impl(m, k, n, max_threads, K::mc()) |
120 | 0 | } |
121 | 0 | } Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::cgemm_kernel::KernelAvx2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::cgemm_kernel::KernelFallback> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::cgemm_kernel::KernelFma> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::dgemm_kernel::KernelSse2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::dgemm_kernel::KernelFmaAvx2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::dgemm_kernel::KernelFallback> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::dgemm_kernel::KernelAvx> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::dgemm_kernel::KernelFma> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::sgemm_kernel::KernelSse2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::sgemm_kernel::KernelFmaAvx2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::sgemm_kernel::KernelFallback> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::sgemm_kernel::KernelAvx> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::sgemm_kernel::KernelFma> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::zgemm_kernel::KernelAvx2> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::zgemm_kernel::KernelFallback> Unexecuted instantiation: <matrixmultiply::threading::LoopThreadConfig>::new::<matrixmultiply::zgemm_kernel::KernelFma> |
122 | | |
123 | | #[cfg(feature="threading")] |
124 | | fn new_impl(m: usize, k: usize, n: usize, max_threads: usize, kmc: usize) -> Self { |
125 | | // use a heuristic to try not to use too many threads for smaller matrices |
126 | | let size_factor = m * k + k * n; |
127 | | let thread_factor = 1 << 14; |
128 | | // pure guesswork in terms of what the default should be |
129 | | let arch_factor = if cfg!(target_arch="arm") { |
130 | | 20 |
131 | | } else { |
132 | | 1 |
133 | | }; |
134 | | |
135 | | // At the moment only a configuration of 1, 2, or 4 threads is supported. |
136 | | // |
137 | | // Prefer to split Loop 3 if only 2 threads are available, (because it was better in a |
138 | | // square matrix benchmark). |
139 | | |
140 | | let matrix_max_threads = size_factor / (thread_factor / arch_factor); |
141 | | let mut max_threads = max_threads.min(matrix_max_threads); |
142 | | |
143 | | let loop3 = if max_threads >= 2 && m >= 3 * (kmc / 2) { |
144 | | max_threads /= 2; |
145 | | 2 |
146 | | } else { |
147 | | 1 |
148 | | }; |
149 | | let loop2 = if max_threads >= 2 { 2 } else { 1 }; |
150 | | |
151 | | LoopThreadConfig { |
152 | | loop3, |
153 | | loop2, |
154 | | } |
155 | | } |
156 | | |
157 | | /// Number of packing buffers for A |
158 | | #[inline(always)] |
159 | 0 | pub(crate) fn num_pack_a(&self) -> usize { self.loop3 as usize } |
160 | | } |
161 | | |
162 | | |
163 | | impl RangeChunk { |
164 | | /// "Builder" method to create a RangeChunkParallel |
165 | 0 | pub(crate) fn parallel(self, nthreads: u8, pool: ThreadPoolCtx) -> RangeChunkParallel<fn()> { |
166 | 0 | fn nop() {} |
167 | | |
168 | 0 | RangeChunkParallel { |
169 | 0 | nthreads, |
170 | 0 | pool, |
171 | 0 | range: self, |
172 | 0 | thread_local: nop, |
173 | 0 | } |
174 | 0 | } |
175 | | } |
176 | | |
177 | | /// Intermediate struct for building the parallel execution of a range chunk. |
178 | | pub(crate) struct RangeChunkParallel<'a, G> { |
179 | | range: RangeChunk, |
180 | | nthreads: u8, |
181 | | pool: ThreadPoolCtx<'a>, |
182 | | thread_local: G, |
183 | | } |
184 | | |
185 | | impl<'a, G> RangeChunkParallel<'a, G> { |
186 | | #[cfg(feature="threading")] |
187 | | /// Set thread local setup function - called once per thread to setup thread local data. |
188 | | pub(crate) fn thread_local<G2, R>(self, func: G2) -> RangeChunkParallel<'a, G2> |
189 | | where G2: Fn(usize, usize) -> R + Sync |
190 | | { |
191 | | RangeChunkParallel { |
192 | | nthreads: self.nthreads, |
193 | | pool: self.pool, |
194 | | thread_local: func, |
195 | | range: self.range, |
196 | | } |
197 | | } |
198 | | |
199 | | #[cfg(not(feature="threading"))] |
200 | | /// Set thread local setup function - called once per thread to setup thread local data. |
201 | 0 | pub(crate) fn thread_local<G2, R>(self, func: G2) -> RangeChunkParallel<'a, G2> |
202 | 0 | where G2: FnOnce(usize, usize) -> R + Sync |
203 | 0 | { |
204 | 0 | RangeChunkParallel { |
205 | 0 | nthreads: self.nthreads, |
206 | 0 | pool: self.pool, |
207 | 0 | thread_local: func, |
208 | 0 | range: self.range, |
209 | 0 | } |
210 | 0 | } Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#0}, &mut [[f32; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#0}, &mut [[f32; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFma>::{closure#0}, &mut [[f32; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#0}, &mut [f64]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#0}, &mut [f64]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#0}, &mut [f64]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#0}, &mut [f64]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFma>::{closure#0}, &mut [f64]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#0}, &mut [f32]> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#0}, &mut [f32]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#0}, &mut [f32]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#0}, &mut [f32]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFma>::{closure#0}, &mut [f32]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#0}, &mut [[f64; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#0}, &mut [[f64; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFma>::{closure#0}, &mut [[f64; 2]]> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f32; 2]>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f32; 2]>> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFma>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f32; 2]>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f64>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f64>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f64>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f64>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFma>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f64>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f32>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f32>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f32>> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f32>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFma>::{closure#0}, matrixmultiply::ptr::Ptr<*mut f32>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f64; 2]>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f64; 2]>> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<fn()>>::thread_local::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFma>::{closure#0}, matrixmultiply::ptr::Ptr<*mut [f64; 2]>> |
211 | | } |
212 | | |
213 | | #[cfg(not(feature="threading"))] |
214 | | impl<G, R> RangeChunkParallel<'_, G> |
215 | | where G: FnOnce(usize, usize) -> R + Sync, |
216 | | { |
217 | 0 | pub(crate) fn for_each<F>(self, for_each: F) |
218 | 0 | where F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync, |
219 | 0 | { |
220 | 0 | let mut local = (self.thread_local)(0, 1); |
221 | 0 | for (ln, chunk_size) in self.range { |
222 | 0 | for_each(self.pool, &mut local, ln, chunk_size) |
223 | | } |
224 | 0 | } Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::cgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#1}> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::dgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::sgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#1}> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_packed<matrixmultiply::zgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::cgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelSse2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFmaAvx2>::{closure#1}> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelAvx>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::dgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelSse2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFmaAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelAvx>::{closure#1}> Unexecuted instantiation: 
<matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::sgemm_kernel::KernelFma>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelAvx2>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFallback>::{closure#1}> Unexecuted instantiation: <matrixmultiply::threading::RangeChunkParallel<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFma>::{closure#0}>>::for_each::<matrixmultiply::gemm::gemm_loop<matrixmultiply::zgemm_kernel::KernelFma>::{closure#1}> |
225 | | } |
226 | | |
227 | | |
#[cfg(feature="threading")]
impl<G, R> RangeChunkParallel<'_, G>
    where G: Fn(usize, usize) -> R + Sync,
{
    /// Run the loop iterations with the given closure, split over the configured threads.
    ///
    /// For every iteration the closure receives:
    ///
    /// - the thread pool context (used for child threads)
    /// - a mutable reference to the thread local data
    /// - the index of the chunk (like RangeChunk)
    /// - the size of the chunk (like RangeChunk)
    pub(crate) fn for_each<F>(self, for_each: F)
        where F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync,
    {
        /// Process the part of `range` assigned to thread `index` out of `nthreads`.
        fn run_part<F, G, R>(range: RangeChunk, index: usize, nthreads: usize,
                             pool: ThreadPoolCtx<'_>, thread_local: G, for_each: F)
            where G: Fn(usize, usize) -> R + Sync,
                  F: Fn(ThreadPoolCtx<'_>, &mut R, usize, usize) + Sync
        {
            let mut state = thread_local(index, nthreads);
            for (ln, chunk_size) in range.part(index, nthreads) {
                for_each(pool, &mut state, ln, chunk_size)
            }
        }

        debug_assert!(self.nthreads <= 4, "this method does not support nthreads > 4, got {}",
                      self.nthreads);
        let RangeChunkParallel { range, nthreads, pool, thread_local } = self;
        let setup = &thread_local;
        let body = &for_each;
        // Never use more than four threads, even if more were requested.
        let nthreads = min(nthreads as usize, 4);
        let f = move |ctx: ThreadPoolCtx<'_>, i| run_part(range, i, nthreads, ctx, setup, body);
        match nthreads {
            // No parallelism: run the single part on the current context.
            0 | 1 => f(pool, 0),
            2 => {
                pool.join(|ctx| f(ctx, 0), |ctx| f(ctx, 1));
            }
            3 => {
                pool.join3l(&f);
            }
            _ => {
                pool.join4(&f);
            }
        }
    }

}
274 | | |