/src/tesseract/src/arch/simddetect.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: simddetect.cpp |
3 | | // Description: Architecture detector. |
4 | | // Author: Stefan Weil (based on code from Ray Smith) |
5 | | // |
6 | | // (C) Copyright 2014, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #ifdef HAVE_CONFIG_H |
19 | | # include "config_auto.h" // for HAVE_AVX, ... |
20 | | #endif |
21 | | #include <numeric> // for std::inner_product |
22 | | #include "dotproduct.h" |
23 | | #include "intsimdmatrix.h" // for IntSimdMatrix |
24 | | #include "params.h" // for STRING_VAR |
25 | | #include "simddetect.h" |
26 | | #include "tprintf.h" // for tprintf |
27 | | |
28 | | #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12) |
29 | | // The GNU compiler g++ fails to compile with the Accelerate framework |
30 | | // (tested with versions 10 and 11), so unconditionally disable it. |
31 | | #undef HAVE_FRAMEWORK_ACCELERATE |
32 | | #endif |
33 | | |
34 | | #if defined(HAVE_FRAMEWORK_ACCELERATE) |
35 | | |
36 | | // Use Apple Accelerate framework. |
37 | | // https://developer.apple.com/documentation/accelerate/simd |
38 | | |
39 | | #include <Accelerate/Accelerate.h> |
40 | | |
41 | | #endif |
42 | | |
43 | | #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1) |
44 | | // See https://en.wikipedia.org/wiki/CPUID. |
45 | | # define HAS_CPUID |
46 | | #endif |
47 | | |
48 | | #if defined(HAS_CPUID) |
49 | | # if defined(__GNUC__) |
50 | | # include <cpuid.h> |
51 | | # elif defined(_WIN32) |
52 | | # include <intrin.h> |
53 | | # endif |
54 | | #endif |
55 | | |
56 | | #if defined(HAVE_NEON) && !defined(__aarch64__) |
57 | | # if defined(HAVE_ANDROID_GETCPUFAMILY) |
58 | | # include <cpu-features.h> |
59 | | # elif defined(HAVE_GETAUXVAL) |
60 | | # include <asm/hwcap.h> |
61 | | # include <sys/auxv.h> |
62 | | # elif defined(HAVE_ELF_AUX_INFO) |
63 | | # include <sys/auxv.h> |
64 | | # endif |
65 | | #endif |
66 | | |
67 | | #if defined(HAVE_RVV) |
68 | | # if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) |
69 | | # include <sys/auxv.h> |
70 | | # define HWCAP_RV(letter) (1ul << ((letter) - 'A')) |
71 | | # endif |
72 | | #endif |
73 | | |
74 | | namespace tesseract { |
75 | | |
76 | | // Computes and returns the dot product of the two n-vectors u and v. |
77 | | // Note: because the order of addition is different among the different dot |
78 | | // product functions, the results can (and do) vary slightly (although they |
79 | | // agree to within about 4e-15). This produces different results when running |
80 | | // training, despite all random inputs being precisely equal. |
81 | | // To get consistent results, use just one of these dot product functions. |
82 | | // On a test multi-layer network, serial is 57% slower than SSE, and AVX |
83 | | // is about 8% faster than SSE. This suggests that the time is memory |
84 | | // bandwidth constrained and could benefit from holding the reused vector |
85 | | // in AVX registers. |
86 | | DotProductFunction DotProduct; |
87 | | |
88 | | static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product"); |
89 | | |
90 | | SIMDDetect SIMDDetect::detector; |
91 | | |
92 | | #if defined(__aarch64__) |
93 | | // ARMv8 always has NEON, so no runtime detection is needed on aarch64. |
94 | | bool SIMDDetect::neon_available_ = true; |
95 | | #elif defined(HAVE_NEON) |
96 | | // If true, then NEON has been detected at runtime by the SIMDDetect constructor. |
97 | | bool SIMDDetect::neon_available_; |
98 | | #elif defined(HAVE_RVV) |
99 | | bool SIMDDetect::rvv_available_; |
100 | | #else |
101 | | // Each flag is true once the SIMDDetect constructor has detected the named AVX, AVX2 or AVX-512 (F, BW, VNNI) feature. |
102 | | bool SIMDDetect::avx_available_; |
103 | | bool SIMDDetect::avx2_available_; |
104 | | bool SIMDDetect::avx512F_available_; |
105 | | bool SIMDDetect::avx512BW_available_; |
106 | | bool SIMDDetect::avx512VNNI_available_; |
107 | | // If true, then FMA has been detected. |
108 | | bool SIMDDetect::fma_available_; |
109 | | // If true, then SSE4.1 has been detected. |
110 | | bool SIMDDetect::sse_available_; |
111 | | #endif |
112 | | |
113 | | #if defined(HAVE_FRAMEWORK_ACCELERATE) |
114 | | static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) { |
115 | | TFloat total = 0; |
116 | | const int stride = 1; |
117 | | #if defined(FAST_FLOAT) |
118 | | vDSP_dotpr(u, stride, v, stride, &total, n); |
119 | | #else |
120 | | vDSP_dotprD(u, stride, v, stride, &total, n); |
121 | | #endif |
122 | | return total; |
123 | | } |
124 | | #endif |
125 | | |
126 | | // Computes and returns the dot product of the two n-vectors u and v. |
127 | 0 | static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) { |
128 | 0 | TFloat total = 0; |
129 | 0 | for (int k = 0; k < n; ++k) { |
130 | 0 | total += u[k] * v[k]; |
131 | 0 | } |
132 | 0 | return total; |
133 | 0 | } |
134 | | |
135 | | // Compute dot product using std::inner_product. |
136 | 0 | static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) { |
137 | 0 | return std::inner_product(u, u + n, v, static_cast<TFloat>(0)); |
138 | 0 | } |
139 | | |
140 | 4 | static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) { |
141 | 4 | DotProduct = f; |
142 | 4 | IntSimdMatrix::intSimdMatrix = m; |
143 | 4 | } |
144 | | |
145 | | // Constructor. |
146 | | // Tests the architecture in a system-dependent way to detect AVX, SSE and |
147 | | // any other available SIMD equipment. |
148 | | // __GNUC__ is also defined by compilers that include GNU extensions such as |
149 | | // clang. |
150 | 2 | SIMDDetect::SIMDDetect() { |
151 | | // The fallback is a generic dot product calculation. |
152 | 2 | SetDotProduct(DotProductGeneric); |
153 | | |
154 | 2 | #if defined(HAS_CPUID) |
155 | 2 | # if defined(__GNUC__) |
156 | 2 | unsigned int eax, ebx, ecx, edx; |
157 | 2 | if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) { |
158 | | // Note that these tests all use hex because the older compilers don't have |
159 | | // the newer flags. |
160 | 2 | # if defined(HAVE_SSE4_1) |
161 | 2 | sse_available_ = (ecx & 0x00080000) != 0; |
162 | 2 | # endif |
163 | 2 | # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) |
164 | 2 | auto xgetbv = []() { |
165 | 2 | uint32_t xcr0; |
166 | 2 | __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); |
167 | 2 | return xcr0; |
168 | 2 | }; |
169 | 2 | if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) { |
170 | | // OSXSAVE bit is set, XMM state and YMM state are fine. |
171 | 2 | # if defined(HAVE_FMA) |
172 | 2 | fma_available_ = (ecx & 0x00001000) != 0; |
173 | 2 | # endif |
174 | 2 | # if defined(HAVE_AVX) |
175 | 2 | avx_available_ = (ecx & 0x10000000) != 0; |
176 | 2 | if (avx_available_) { |
177 | | // There is supposed to be a __get_cpuid_count function, but this is all |
178 | | // there is in my cpuid.h. It is a macro for an asm statement and cannot |
179 | | // be used inside an if. |
180 | 2 | __cpuid_count(7, 0, eax, ebx, ecx, edx); |
181 | 2 | avx2_available_ = (ebx & 0x00000020) != 0; |
182 | 2 | avx512F_available_ = (ebx & 0x00010000) != 0; |
183 | 2 | avx512BW_available_ = (ebx & 0x40000000) != 0; |
184 | 2 | avx512VNNI_available_ = (ecx & 0x00000800) != 0; |
185 | 2 | } |
186 | 2 | # endif |
187 | 2 | } |
188 | 2 | # endif |
189 | 2 | } |
190 | | # elif defined(_WIN32) |
191 | | int cpuInfo[4]; |
192 | | int max_function_id; |
193 | | __cpuid(cpuInfo, 0); |
194 | | max_function_id = cpuInfo[0]; |
195 | | if (max_function_id >= 1) { |
196 | | __cpuid(cpuInfo, 1); |
197 | | # if defined(HAVE_SSE4_1) |
198 | | sse_available_ = (cpuInfo[2] & 0x00080000) != 0; |
199 | | # endif |
200 | | # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) |
201 | | if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) { |
202 | | // OSXSAVE bit is set, XMM state and YMM state are fine. |
203 | | # if defined(HAVE_FMA) |
204 | | fma_available_ = (cpuInfo[2] & 0x00001000) != 0; |
205 | | # endif |
206 | | # if defined(HAVE_AVX) |
207 | | avx_available_ = (cpuInfo[2] & 0x10000000) != 0; |
208 | | # endif |
209 | | # if defined(HAVE_AVX2) |
210 | | if (max_function_id >= 7) { |
211 | | __cpuid(cpuInfo, 7); |
212 | | avx2_available_ = (cpuInfo[1] & 0x00000020) != 0; |
213 | | avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0; |
214 | | avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0; |
215 | | avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0; |
216 | | } |
217 | | # endif |
218 | | } |
219 | | # endif |
220 | | } |
221 | | # else |
222 | | # error "I don't know how to test for SIMD with this compiler" |
223 | | # endif |
224 | 2 | #endif |
225 | | |
226 | | #if defined(HAVE_NEON) && !defined(__aarch64__) |
227 | | # if defined(HAVE_ANDROID_GETCPUFAMILY) |
228 | | { |
229 | | AndroidCpuFamily family = android_getCpuFamily(); |
230 | | if (family == ANDROID_CPU_FAMILY_ARM) |
231 | | neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON); |
232 | | } |
233 | | # elif defined(HAVE_GETAUXVAL) |
234 | | neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON; |
235 | | # elif defined(HAVE_ELF_AUX_INFO) |
236 | | unsigned long hwcap = 0; |
237 | | elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); |
238 | | neon_available_ = hwcap & HWCAP_NEON; |
239 | | # endif |
240 | | #endif |
241 | | |
242 | | #if defined(HAVE_RVV) |
243 | | # if defined(HAVE_GETAUXVAL) |
244 | | const unsigned long hwcap = getauxval(AT_HWCAP); |
245 | | rvv_available_ = hwcap & HWCAP_RV('V'); |
246 | | # elif defined(HAVE_ELF_AUX_INFO) |
247 | | unsigned long hwcap = 0; |
248 | | elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); |
249 | | rvv_available_ = hwcap & HWCAP_RV('V'); |
250 | | # endif |
251 | | #endif |
252 | | |
253 | | // Select code for calculation of dot product based on autodetection. |
254 | 2 | if (false) { |
255 | | // This is a dummy to support conditional compilation. |
256 | 0 | #if defined(HAVE_AVX512F) |
257 | 2 | } else if (avx512F_available_) { |
258 | | // AVX512F detected. |
259 | 0 | SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2); |
260 | 0 | #endif |
261 | 0 | #if defined(HAVE_AVX2) |
262 | 2 | } else if (avx2_available_) { |
263 | | // AVX2 detected. |
264 | 2 | SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); |
265 | 2 | #endif |
266 | 2 | #if defined(HAVE_AVX) |
267 | 2 | } else if (avx_available_) { |
268 | | // AVX detected. |
269 | 0 | SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); |
270 | 0 | #endif |
271 | 0 | #if defined(HAVE_SSE4_1) |
272 | 0 | } else if (sse_available_) { |
273 | | // SSE detected. |
274 | 0 | SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); |
275 | 0 | #endif |
276 | | #if defined(HAVE_NEON) || defined(__aarch64__) |
277 | | } else if (neon_available_) { |
278 | | // NEON detected. |
279 | | SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); |
280 | | #endif |
281 | | #if defined(HAVE_RVV) |
282 | | } else if (rvv_available_) { |
283 | | SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV); |
284 | | #endif |
285 | 0 | } |
286 | | |
287 | 2 | const char *dotproduct_env = getenv("DOTPRODUCT"); |
288 | 2 | if (dotproduct_env != nullptr) { |
289 | | // Override automatic settings by value from environment variable. |
290 | 0 | dotproduct = dotproduct_env; |
291 | 0 | Update(); |
292 | 0 | } |
293 | 2 | } |
294 | | |
295 | 0 | void SIMDDetect::Update() { |
296 | | // Select code for calculation of dot product based on the |
297 | | // value of the config variable if that value is not empty. |
298 | 0 | const char *dotproduct_method = "generic"; |
299 | 0 | if (dotproduct == "auto") { |
300 | | // Automatic detection. Nothing to be done. |
301 | 0 | } else if (dotproduct == "generic") { |
302 | | // Generic code selected by config variable. |
303 | 0 | SetDotProduct(DotProductGeneric); |
304 | 0 | dotproduct_method = "generic"; |
305 | 0 | } else if (dotproduct == "native") { |
306 | | // Native optimized code selected by config variable. |
307 | 0 | SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix); |
308 | 0 | dotproduct_method = "native"; |
309 | 0 | #if defined(HAVE_AVX2) |
310 | 0 | } else if (dotproduct == "avx2") { |
311 | | // AVX2 selected by config variable. |
312 | 0 | SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); |
313 | 0 | dotproduct_method = "avx2"; |
314 | 0 | #endif |
315 | 0 | #if defined(HAVE_AVX) |
316 | 0 | } else if (dotproduct == "avx") { |
317 | | // AVX selected by config variable. |
318 | 0 | SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); |
319 | 0 | dotproduct_method = "avx"; |
320 | 0 | #endif |
321 | 0 | #if defined(HAVE_FMA) |
322 | 0 | } else if (dotproduct == "fma") { |
323 | | // FMA selected by config variable. |
324 | 0 | SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix); |
325 | 0 | dotproduct_method = "fma"; |
326 | 0 | #endif |
327 | 0 | #if defined(HAVE_SSE4_1) |
328 | 0 | } else if (dotproduct == "sse") { |
329 | | // SSE selected by config variable. |
330 | 0 | SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); |
331 | 0 | dotproduct_method = "sse"; |
332 | 0 | #endif |
333 | | #if defined(HAVE_FRAMEWORK_ACCELERATE) |
334 | | } else if (dotproduct == "accelerate") { |
335 | | SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix); |
336 | | #endif |
337 | | #if defined(HAVE_NEON) || defined(__aarch64__) |
338 | | } else if (dotproduct == "neon" && neon_available_) { |
339 | | // NEON selected by config variable. |
340 | | SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); |
341 | | dotproduct_method = "neon"; |
342 | | #endif |
343 | 0 | } else if (dotproduct == "std::inner_product") { |
344 | | // std::inner_product selected by config variable. |
345 | 0 | SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix); |
346 | 0 | dotproduct_method = "std::inner_product"; |
347 | 0 | } else { |
348 | | // Unsupported value of config variable. |
349 | 0 | tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n", |
350 | 0 | dotproduct.c_str()); |
351 | 0 | tprintf( |
352 | 0 | "Supported values for dotproduct: auto generic native" |
353 | 0 | #if defined(HAVE_AVX2) |
354 | 0 | " avx2" |
355 | 0 | #endif |
356 | 0 | #if defined(HAVE_AVX) |
357 | 0 | " avx" |
358 | 0 | #endif |
359 | 0 | #if defined(HAVE_FMA) |
360 | 0 | " fma" |
361 | 0 | #endif |
362 | 0 | #if defined(HAVE_SSE4_1) |
363 | 0 | " sse" |
364 | 0 | #endif |
365 | | #if defined(HAVE_FRAMEWORK_ACCELERATE) |
366 | | " accelerate" |
367 | | #endif |
368 | 0 | " std::inner_product.\n"); |
369 | 0 | } |
370 | |
|
371 | 0 | dotproduct.set_value(dotproduct_method); |
372 | 0 | } |
373 | | |
374 | | } // namespace tesseract |