Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/arch/simddetect.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        simddetect.cpp
3
// Description: Architecture detector.
4
// Author:      Stefan Weil (based on code from Ray Smith)
5
//
6
// (C) Copyright 2014, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
///////////////////////////////////////////////////////////////////////
17
18
#ifdef HAVE_CONFIG_H
19
#  include "config_auto.h" // for HAVE_AVX, ...
20
#endif
21
#include <numeric> // for std::inner_product
22
#include "dotproduct.h"
23
#include "intsimdmatrix.h" // for IntSimdMatrix
24
#include "params.h"        // for STRING_VAR
25
#include "simddetect.h"
26
#include "tprintf.h" // for tprintf
27
28
#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29
// The GNU compiler g++ fails to compile with the Accelerate framework
30
// (tested with versions 10 and 11), so unconditionally disable it.
31
#undef HAVE_FRAMEWORK_ACCELERATE
32
#endif
33
34
#if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36
// Use Apple Accelerate framework.
37
// https://developer.apple.com/documentation/accelerate/simd
38
39
#include <Accelerate/Accelerate.h>
40
41
#endif
42
43
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44
// See https://en.wikipedia.org/wiki/CPUID.
45
#  define HAS_CPUID
46
#endif
47
48
#if defined(HAS_CPUID)
49
#  if defined(__GNUC__)
50
#    include <cpuid.h>
51
#  elif defined(_WIN32)
52
#    include <intrin.h>
53
#  endif
54
#endif
55
56
#if defined(HAVE_NEON) && !defined(__aarch64__)
57
#  if defined(HAVE_ANDROID_GETCPUFAMILY)
58
#    include <cpu-features.h>
59
#  elif defined(HAVE_GETAUXVAL)
60
#    include <asm/hwcap.h>
61
#    include <sys/auxv.h>
62
#  elif defined(HAVE_ELF_AUX_INFO)
63
#    include <sys/auxv.h>
64
#  endif
65
#endif
66
67
#if defined(HAVE_RVV)
68
#  if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
69
#    include <sys/auxv.h>
70
#    define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
71
#  endif
72
#endif
73
74
namespace tesseract {
75
76
// Computes and returns the dot product of the two n-vectors u and v.
77
// Note: because the order of addition is different among the different dot
78
// product functions, the results can (and do) vary slightly (although they
79
// agree to within about 4e-15). This produces different results when running
80
// training, despite all random inputs being precisely equal.
81
// To get consistent results, use just one of these dot product functions.
82
// On a test multi-layer network, serial is 57% slower than SSE, and AVX
83
// is about 8% faster than SSE. This suggests that the time is memory
84
// bandwidth constrained and could benefit from holding the reused vector
85
// in AVX registers.
86
// Currently selected dot product implementation. It is assigned by
// SetDotProduct(), which is called from the SIMDDetect constructor and from
// SIMDDetect::Update().
DotProductFunction DotProduct;
87
88
// Config variable naming the dot product implementation to use. The default
// value is "auto"; the value is evaluated by SIMDDetect::Update() and can be
// overridden by the environment variable DOTPRODUCT (see the constructor).
static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
89
90
// Static detector instance. Its constructor performs the SIMD detection and
// installs the initial dot product function.
SIMDDetect SIMDDetect::detector;
91
92
// Definitions of the static availability flags of SIMDDetect. Exactly one of
// the following groups is compiled, depending on the target architecture and
// on the build configuration. Flags without an initializer start as false
// (static storage) and are set by the SIMDDetect constructor.
#if defined(__aarch64__)
93
// ARMv8 always has NEON.
94
bool SIMDDetect::neon_available_ = true;
95
#elif defined(HAVE_NEON)
96
// If true, then NEON has been detected.
97
bool SIMDDetect::neon_available_;
98
#elif defined(HAVE_RVV)
99
// If true, then the RISC-V vector extension (RVV) has been detected.
bool SIMDDetect::rvv_available_;
100
#else
101
// If true, then AVX has been detected.
102
bool SIMDDetect::avx_available_;
103
// If true, then AVX2 has been detected.
bool SIMDDetect::avx2_available_;
104
// If true, then the respective AVX-512 subset (F, BW, VNNI) has been detected.
bool SIMDDetect::avx512F_available_;
105
bool SIMDDetect::avx512BW_available_;
106
bool SIMDDetect::avx512VNNI_available_;
107
// If true, then FMA has been detected.
108
bool SIMDDetect::fma_available_;
109
// If true, then SSE4.1 has been detected.
110
bool SIMDDetect::sse_available_;
111
#endif
112
113
#if defined(HAVE_FRAMEWORK_ACCELERATE)
114
// Computes and returns the dot product of the two n-vectors u and v using the
// vDSP routines of the Apple Accelerate framework. vDSP_dotpr is called when
// FAST_FLOAT is defined, vDSP_dotprD otherwise.
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
  // Both vectors are read with a stride of one element.
  const int unit_stride = 1;
  TFloat result = 0;
#if defined(FAST_FLOAT)
  vDSP_dotpr(u, unit_stride, v, unit_stride, &result, n);
#else
  vDSP_dotprD(u, unit_stride, v, unit_stride, &result, n);
#endif
  return result;
}
124
#endif
125
126
// Computes and returns the dot product of the two n-vectors u and v.
127
0
static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
128
0
  TFloat total = 0;
129
0
  for (int k = 0; k < n; ++k) {
130
0
    total += u[k] * v[k];
131
0
  }
132
0
  return total;
133
0
}
134
135
// Compute dot product using std::inner_product.
136
0
static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
137
0
  return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
138
0
}
139
140
4
// Installs f as the global dot product function and m as the active
// IntSimdMatrix implementation. When m is omitted, the active matrix
// implementation is reset to nullptr.
static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
  IntSimdMatrix::intSimdMatrix = m;
  DotProduct = f;
}
144
145
// Constructor.
146
// Tests the architecture in a system-dependent way to detect AVX, SSE and
147
// any other available SIMD equipment.
148
// __GNUC__ is also defined by compilers that include GNU extensions such as
149
// clang.
150
2
SIMDDetect::SIMDDetect() {
151
  // The fallback is a generic dot product calculation.
152
2
  SetDotProduct(DotProductGeneric);
153
154
2
#if defined(HAS_CPUID)
155
2
#  if defined(__GNUC__)
156
2
  unsigned int eax, ebx, ecx, edx;
157
2
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
158
    // Note that these tests all use hex because the older compilers don't have
159
    // the newer flags.
160
2
#    if defined(HAVE_SSE4_1)
161
2
    sse_available_ = (ecx & 0x00080000) != 0;
162
2
#    endif
163
2
#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
164
2
    auto xgetbv = []() {
165
2
      uint32_t xcr0;
166
2
      __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
167
2
      return xcr0;
168
2
    };
169
2
    if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
170
      // OSXSAVE bit is set, XMM state and YMM state are fine.
171
2
#      if defined(HAVE_FMA)
172
2
      fma_available_ = (ecx & 0x00001000) != 0;
173
2
#      endif
174
2
#      if defined(HAVE_AVX)
175
2
      avx_available_ = (ecx & 0x10000000) != 0;
176
2
      if (avx_available_) {
177
        // There is supposed to be a __get_cpuid_count function, but this is all
178
        // there is in my cpuid.h. It is a macro for an asm statement and cannot
179
        // be used inside an if.
180
2
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
181
2
        avx2_available_ = (ebx & 0x00000020) != 0;
182
2
        avx512F_available_ = (ebx & 0x00010000) != 0;
183
2
        avx512BW_available_ = (ebx & 0x40000000) != 0;
184
2
        avx512VNNI_available_ = (ecx & 0x00000800) != 0;
185
2
      }
186
2
#      endif
187
2
    }
188
2
#    endif
189
2
  }
190
#  elif defined(_WIN32)
191
  int cpuInfo[4];
192
  int max_function_id;
193
  __cpuid(cpuInfo, 0);
194
  max_function_id = cpuInfo[0];
195
  if (max_function_id >= 1) {
196
    __cpuid(cpuInfo, 1);
197
#    if defined(HAVE_SSE4_1)
198
    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
199
#    endif
200
#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
201
    if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
202
      // OSXSAVE bit is set, XMM state and YMM state are fine.
203
#      if defined(HAVE_FMA)
204
      fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
205
#      endif
206
#      if defined(HAVE_AVX)
207
      avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
208
#      endif
209
#      if defined(HAVE_AVX2)
210
      if (max_function_id >= 7) {
211
        __cpuid(cpuInfo, 7);
212
        avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
213
        avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
214
        avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
215
        avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;
216
      }
217
#      endif
218
    }
219
#    endif
220
  }
221
#  else
222
#    error "I don't know how to test for SIMD with this compiler"
223
#  endif
224
2
#endif
225
226
#if defined(HAVE_NEON) && !defined(__aarch64__)
227
#  if defined(HAVE_ANDROID_GETCPUFAMILY)
228
  {
229
    AndroidCpuFamily family = android_getCpuFamily();
230
    if (family == ANDROID_CPU_FAMILY_ARM)
231
      neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
232
  }
233
#  elif defined(HAVE_GETAUXVAL)
234
  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
235
#  elif defined(HAVE_ELF_AUX_INFO)
236
  unsigned long hwcap = 0;
237
  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
238
  neon_available_ = hwcap & HWCAP_NEON;
239
#  endif
240
#endif
241
242
#if defined(HAVE_RVV)
243
#  if defined(HAVE_GETAUXVAL)
244
  const unsigned long hwcap = getauxval(AT_HWCAP);
245
  rvv_available_ = hwcap & HWCAP_RV('V');
246
#  elif defined(HAVE_ELF_AUX_INFO)
247
  unsigned long hwcap = 0;
248
  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
249
  rvv_available_ = hwcap & HWCAP_RV('V');
250
#  endif
251
#endif
252
253
  // Select code for calculation of dot product based on autodetection.
254
2
  if (false) {
255
    // This is a dummy to support conditional compilation.
256
0
#if defined(HAVE_AVX512F)
257
2
  } else if (avx512F_available_) {
258
    // AVX512F detected.
259
0
    SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
260
0
#endif
261
0
#if defined(HAVE_AVX2)
262
2
  } else if (avx2_available_) {
263
    // AVX2 detected.
264
2
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
265
2
#endif
266
2
#if defined(HAVE_AVX)
267
2
  } else if (avx_available_) {
268
    // AVX detected.
269
0
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
270
0
#endif
271
0
#if defined(HAVE_SSE4_1)
272
0
  } else if (sse_available_) {
273
    // SSE detected.
274
0
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
275
0
#endif
276
#if defined(HAVE_NEON) || defined(__aarch64__)
277
  } else if (neon_available_) {
278
    // NEON detected.
279
    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
280
#endif
281
#if defined(HAVE_RVV)
282
  } else if (rvv_available_) {
283
    SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV);
284
#endif
285
0
  }
286
287
2
  const char *dotproduct_env = getenv("DOTPRODUCT");
288
2
  if (dotproduct_env != nullptr) {
289
    // Override automatic settings by value from environment variable.
290
0
    dotproduct = dotproduct_env;
291
0
    Update();
292
0
  }
293
2
}
294
295
0
void SIMDDetect::Update() {
296
  // Select code for calculation of dot product based on the
297
  // value of the config variable if that value is not empty.
298
0
  const char *dotproduct_method = "generic";
299
0
  if (dotproduct == "auto") {
300
    // Automatic detection. Nothing to be done.
301
0
  } else if (dotproduct == "generic") {
302
    // Generic code selected by config variable.
303
0
    SetDotProduct(DotProductGeneric);
304
0
    dotproduct_method = "generic";
305
0
  } else if (dotproduct == "native") {
306
    // Native optimized code selected by config variable.
307
0
    SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
308
0
    dotproduct_method = "native";
309
0
#if defined(HAVE_AVX2)
310
0
  } else if (dotproduct == "avx2") {
311
    // AVX2 selected by config variable.
312
0
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
313
0
    dotproduct_method = "avx2";
314
0
#endif
315
0
#if defined(HAVE_AVX)
316
0
  } else if (dotproduct == "avx") {
317
    // AVX selected by config variable.
318
0
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
319
0
    dotproduct_method = "avx";
320
0
#endif
321
0
#if defined(HAVE_FMA)
322
0
  } else if (dotproduct == "fma") {
323
    // FMA selected by config variable.
324
0
    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
325
0
    dotproduct_method = "fma";
326
0
#endif
327
0
#if defined(HAVE_SSE4_1)
328
0
  } else if (dotproduct == "sse") {
329
    // SSE selected by config variable.
330
0
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
331
0
    dotproduct_method = "sse";
332
0
#endif
333
#if defined(HAVE_FRAMEWORK_ACCELERATE)
334
  } else if (dotproduct == "accelerate") {
335
    SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
336
#endif
337
#if defined(HAVE_NEON) || defined(__aarch64__)
338
  } else if (dotproduct == "neon" && neon_available_) {
339
    // NEON selected by config variable.
340
    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
341
    dotproduct_method = "neon";
342
#endif
343
0
  } else if (dotproduct == "std::inner_product") {
344
    // std::inner_product selected by config variable.
345
0
    SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
346
0
    dotproduct_method = "std::inner_product";
347
0
  } else {
348
    // Unsupported value of config variable.
349
0
    tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
350
0
            dotproduct.c_str());
351
0
    tprintf(
352
0
        "Supported values for dotproduct: auto generic native"
353
0
#if defined(HAVE_AVX2)
354
0
        " avx2"
355
0
#endif
356
0
#if defined(HAVE_AVX)
357
0
        " avx"
358
0
#endif
359
0
#if defined(HAVE_FMA)
360
0
        " fma"
361
0
#endif
362
0
#if defined(HAVE_SSE4_1)
363
0
        " sse"
364
0
#endif
365
#if defined(HAVE_FRAMEWORK_ACCELERATE)
366
        " accelerate"
367
#endif
368
0
        " std::inner_product.\n");
369
0
  }
370
371
0
  dotproduct.set_value(dotproduct_method);
372
0
}
373
374
} // namespace tesseract