/src/tesseract/src/arch/simddetect.cpp

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        simddetect.cpp
// Description: Architecture detector.
// Author:      Stefan Weil (based on code from Ray Smith)
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifdef HAVE_CONFIG_H
#  include "config_auto.h" // for HAVE_AVX, ...
#endif
#include <numeric> // for std::inner_product
#include "dotproduct.h"
#include "intsimdmatrix.h" // for IntSimdMatrix
#include "params.h"        // for STRING_VAR
#include "simddetect.h"
#include "tprintf.h" // for tprintf

#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
// The GNU compiler g++ fails to compile with the Accelerate framework
// (tested with versions 10 and 11), so unconditionally disable it.
#undef HAVE_FRAMEWORK_ACCELERATE
#endif

#if defined(HAVE_FRAMEWORK_ACCELERATE)

// Use Apple Accelerate framework.
// https://developer.apple.com/documentation/accelerate/simd

#include <Accelerate/Accelerate.h>

#endif

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
// See https://en.wikipedia.org/wiki/CPUID.
#  define HAS_CPUID
#endif

#if defined(HAS_CPUID)
#  if defined(__GNUC__)
#    include <cpuid.h>
#  elif defined(_WIN32)
#    include <intrin.h>
#  endif
#endif

#if defined(HAVE_NEON) && !defined(__aarch64__)
#  if defined(HAVE_ANDROID_GETCPUFAMILY)
#    include <cpu-features.h>
#  elif defined(HAVE_GETAUXVAL)
#    include <asm/hwcap.h>
#    include <sys/auxv.h>
#  elif defined(HAVE_ELF_AUX_INFO)
#    include <sys/auxv.h>
#  endif
#endif

#if defined(HAVE_RVV)
#  if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
#    include <sys/auxv.h>
#    define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
#  endif
#endif

namespace tesseract {

// Pointer to the dot product implementation currently in use. It is assigned
// by SetDotProduct(), which the SIMDDetect constructor and
// SIMDDetect::Update() call.
// Computes and returns the dot product of the two n-vectors u and v.
// Note: because the order of addition is different among the different dot
// product functions, the results can (and do) vary slightly (although they
// agree to within about 4e-15). This produces different results when running
// training, despite all random inputs being precisely equal.
// To get consistent results, use just one of these dot product functions.
// On a test multi-layer network, serial is 57% slower than SSE, and AVX
// is about 8% faster than SSE. This suggests that the time is memory
// bandwidth constrained and could benefit from holding the reused vector
// in AVX registers.
DotProductFunction DotProduct;

// Config variable naming the dot product implementation.
// SIMDDetect::Update() compares its value against the supported method names,
// and the SIMDDetect constructor overwrites it from the DOTPRODUCT
// environment variable when that is set.
// NOTE(review): "auto" is presumably the default value -- confirm against the
// STRING_VAR macro in params.h.
static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");

// The detector instance. Its constructor performs the SIMD detection and may
// assign to `dotproduct`.
// NOTE(review): assuming STRING_VAR defines an object in this translation
// unit, this definition should stay below it so that `dotproduct` is
// initialized before the constructor runs.
SIMDDetect SIMDDetect::detector;

// Detection results. Only the flags of one architecture branch are defined.
// Flags without an initializer have static storage and therefore start out
// false until the SIMDDetect constructor assigns them.
#if defined(__aarch64__)
// ARMv8 always has NEON.
bool SIMDDetect::neon_available_ = true;
#elif defined(HAVE_NEON)
// If true, then Neon has been detected.
bool SIMDDetect::neon_available_;
#elif defined(HAVE_RVV)
// If true, then the 'V' bit was set in the hardware capabilities (AT_HWCAP).
bool SIMDDetect::rvv_available_;
#else
// If true, then AVX has been detected.
bool SIMDDetect::avx_available_;
// The AVX2 and AVX-512 flags are set by the constructor from the feature
// bits of CPUID leaf 7.
bool SIMDDetect::avx2_available_;
bool SIMDDetect::avx512F_available_;
bool SIMDDetect::avx512BW_available_;
bool SIMDDetect::avx512VNNI_available_;
// If true, then FMA has been detected.
bool SIMDDetect::fma_available_;
// If true, then SSE4.1 has been detected.
bool SIMDDetect::sse_available_;
#endif

#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes the dot product of the two n-vectors u and v by delegating to the
// vDSP routines of the Apple Accelerate framework. vDSP_dotpr is called when
// FAST_FLOAT is defined, vDSP_dotprD otherwise.
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
  // A stride of 1 makes vDSP read consecutive elements of both vectors.
  constexpr int kUnitStride = 1;
  TFloat result = 0;
#if defined(FAST_FLOAT)
  vDSP_dotpr(u, kUnitStride, v, kUnitStride, &result, n);
#else
  vDSP_dotprD(u, kUnitStride, v, kUnitStride, &result, n);
#endif
  return result;
}
#endif

// Portable fallback without SIMD intrinsics: returns the sum of u[i] * v[i]
// for i in [0, n), accumulated in increasing index order. Returns 0 when
// n <= 0.
static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
  TFloat sum = 0;
  int i = 0;
  while (i < n) {
    sum += u[i] * v[i];
    ++i;
  }
  return sum;
}

// Computes the dot product of the two n-vectors u and v by handing the range
// [u, u + n) and the start of v to std::inner_product, with a zero of type
// TFloat as the initial value.
static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
  const TFloat *const u_end = u + n;
  const TFloat initial = static_cast<TFloat>(0);
  return std::inner_product(u, u_end, v, initial);
}

// Installs the active implementations: `f` becomes the global DotProduct
// function and `m` the global IntSimdMatrix::intSimdMatrix. When `m` is
// omitted, IntSimdMatrix::intSimdMatrix is set to nullptr.
static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
  IntSimdMatrix::intSimdMatrix = m;
  DotProduct = f;
}

// Constructor.
// Tests the architecture in a system-dependent way to detect AVX, SSE and
// any other available SIMD equipment, then installs the matching dot product
// function and integer SIMD matrix via SetDotProduct(). Finally, a value in
// the DOTPRODUCT environment variable overrides the automatic selection.
// __GNUC__ is also defined by compilers that include GNU extensions such as
// clang.
SIMDDetect::SIMDDetect() {
  // The fallback is a generic dot product calculation.
  SetDotProduct(DotProductGeneric);

#if defined(HAS_CPUID)
#  if defined(__GNUC__)
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
    // Note that these tests all use hex because the older compilers don't have
    // the newer flags.
#    if defined(HAVE_SSE4_1)
    sse_available_ = (ecx & 0x00080000) != 0;
#    endif
#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
    auto xgetbv = []() {
      uint32_t value;
      __asm__("xgetbv" : "=a"(value) : "c"(0) : "%edx");
      return value;
    };
    // xgetbv may only be executed when the OSXSAVE bit is set, so XCR0 is
    // treated as 0 (no extended state enabled) otherwise.
    const uint32_t xcr0 = (ecx & 0x08000000) ? xgetbv() : 0;
    if ((xcr0 & 6) == 6) {
      // OSXSAVE bit is set, XMM state and YMM state are fine.
#      if defined(HAVE_FMA)
      fma_available_ = (ecx & 0x00001000) != 0;
#      endif
#      if defined(HAVE_AVX)
      avx_available_ = (ecx & 0x10000000) != 0;
      if (avx_available_) {
        // There is supposed to be a __get_cpuid_count function, but this is all
        // there is in my cpuid.h. It is a macro for an asm statement and cannot
        // be used inside an if.
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        avx2_available_ = (ebx & 0x00000020) != 0;
        // AVX-512 additionally needs the opmask, ZMM_Hi256 and Hi16_ZMM state
        // (XCR0 bits 5..7) to be enabled by the operating system. Without
        // that, the CPUID feature bits alone do not make the instructions
        // usable.
        // NOTE(review): an OS which enables this state lazily (reported for
        // macOS -- TODO confirm) is conservatively treated as AVX2 only.
        const bool zmm_state_enabled = (xcr0 & 0xE0) == 0xE0;
        avx512F_available_ = zmm_state_enabled && (ebx & 0x00010000) != 0;
        avx512BW_available_ = zmm_state_enabled && (ebx & 0x40000000) != 0;
        avx512VNNI_available_ = zmm_state_enabled && (ecx & 0x00000800) != 0;
      }
#      endif
    }
#    endif
  }
#  elif defined(_WIN32)
  int cpuInfo[4];
  int max_function_id;
  __cpuid(cpuInfo, 0);
  max_function_id = cpuInfo[0];
  if (max_function_id >= 1) {
    __cpuid(cpuInfo, 1);
#    if defined(HAVE_SSE4_1)
    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
#    endif
#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
    // _xgetbv may only be executed when the OSXSAVE bit is set, so XCR0 is
    // treated as 0 (no extended state enabled) otherwise.
    const unsigned long long xcr0 = (cpuInfo[2] & 0x08000000) ? _xgetbv(0) : 0;
    if ((xcr0 & 6) == 6) {
      // OSXSAVE bit is set, XMM state and YMM state are fine.
#      if defined(HAVE_FMA)
      fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
#      endif
#      if defined(HAVE_AVX)
      avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
#      endif
#      if defined(HAVE_AVX2)
      if (max_function_id >= 7) {
        __cpuid(cpuInfo, 7);
        avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
        // AVX-512 additionally needs the opmask, ZMM_Hi256 and Hi16_ZMM state
        // (XCR0 bits 5..7) to be enabled by the operating system.
        const bool zmm_state_enabled = (xcr0 & 0xE0) == 0xE0;
        avx512F_available_ = zmm_state_enabled && (cpuInfo[1] & 0x00010000) != 0;
        avx512BW_available_ = zmm_state_enabled && (cpuInfo[1] & 0x40000000) != 0;
        avx512VNNI_available_ = zmm_state_enabled && (cpuInfo[2] & 0x00000800) != 0;
      }
#      endif
    }
#    endif
  }
#  else
#    error "I don't know how to test for SIMD with this compiler"
#  endif
#endif

#if defined(HAVE_NEON) && !defined(__aarch64__)
#  if defined(HAVE_ANDROID_GETCPUFAMILY)
  {
    AndroidCpuFamily family = android_getCpuFamily();
    if (family == ANDROID_CPU_FAMILY_ARM)
      neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
  }
#  elif defined(HAVE_GETAUXVAL)
  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
#  elif defined(HAVE_ELF_AUX_INFO)
  unsigned long hwcap = 0;
  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
  neon_available_ = hwcap & HWCAP_NEON;
#  endif
#endif

#if defined(HAVE_RVV)
#  if defined(HAVE_GETAUXVAL)
  const unsigned long hwcap = getauxval(AT_HWCAP);
  rvv_available_ = hwcap & HWCAP_RV('V');
#  elif defined(HAVE_ELF_AUX_INFO)
  unsigned long hwcap = 0;
  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
  rvv_available_ = hwcap & HWCAP_RV('V');
#  endif
#endif

  // Select code for calculation of dot product based on autodetection.
  // The first matching branch wins, so the branches are ordered from the
  // most capable instruction set downwards.
  if (false) {
    // This is a dummy to support conditional compilation.
#if defined(HAVE_AVX512F)
  } else if (avx512F_available_) {
    // AVX512F detected.
    SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
#endif
#if defined(HAVE_AVX2)
  } else if (avx2_available_) {
    // AVX2 detected.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
#endif
#if defined(HAVE_AVX)
  } else if (avx_available_) {
    // AVX detected.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
#endif
#if defined(HAVE_SSE4_1)
  } else if (sse_available_) {
    // SSE detected.
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
#endif
#if defined(HAVE_NEON) || defined(__aarch64__)
  } else if (neon_available_) {
    // NEON detected.
    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
#endif
#if defined(HAVE_RVV)
  } else if (rvv_available_) {
    // RVV detected: only the integer matrix is specialized, the dot product
    // stays generic.
    SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV);
#endif
  }

  const char *dotproduct_env = getenv("DOTPRODUCT");
  if (dotproduct_env != nullptr) {
    // Override automatic settings by value from environment variable.
    dotproduct = dotproduct_env;
    Update();
  }
}

// Selects the code for calculation of the dot product based on the value of
// the config variable `dotproduct` and stores the name of the resulting
// method back into that variable.
//  - "auto" keeps whatever function is currently installed and keeps the
//    "auto" label, so that a later call does not switch to generic code.
//  - A supported method name installs that method via SetDotProduct().
//  - Any other value prints a warning with the list of supported values and
//    leaves the installed function untouched. The variable is then reset to
//    "generic".
void SIMDDetect::Update() {
  const char *dotproduct_method = "generic";
  if (dotproduct == "auto") {
    // Automatic detection. Nothing to be done, the label stays "auto".
    dotproduct_method = "auto";
  } else if (dotproduct == "generic") {
    // Generic code selected by config variable.
    SetDotProduct(DotProductGeneric);
    dotproduct_method = "generic";
  } else if (dotproduct == "native") {
    // Native optimized code selected by config variable.
    SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
    dotproduct_method = "native";
#if defined(HAVE_AVX2)
  } else if (dotproduct == "avx2") {
    // AVX2 selected by config variable.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
    dotproduct_method = "avx2";
#endif
#if defined(HAVE_AVX)
  } else if (dotproduct == "avx") {
    // AVX selected by config variable.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
    dotproduct_method = "avx";
#endif
#if defined(HAVE_FMA)
  } else if (dotproduct == "fma") {
    // FMA selected by config variable.
    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
    dotproduct_method = "fma";
#endif
#if defined(HAVE_SSE4_1)
  } else if (dotproduct == "sse") {
    // SSE selected by config variable.
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
    dotproduct_method = "sse";
#endif
#if defined(HAVE_FRAMEWORK_ACCELERATE)
  } else if (dotproduct == "accelerate") {
    // Apple Accelerate framework selected by config variable.
    SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
    dotproduct_method = "accelerate";
#endif
#if defined(HAVE_NEON) || defined(__aarch64__)
  } else if (dotproduct == "neon" && neon_available_) {
    // NEON selected by config variable. Without detected NEON support the
    // value falls through to the unsupported case below.
    SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
    dotproduct_method = "neon";
#endif
  } else if (dotproduct == "std::inner_product") {
    // std::inner_product selected by config variable.
    SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
    dotproduct_method = "std::inner_product";
  } else {
    // Unsupported value of config variable.
    tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
            dotproduct.c_str());
    tprintf(
        "Supported values for dotproduct: auto generic native"
#if defined(HAVE_AVX2)
        " avx2"
#endif
#if defined(HAVE_AVX)
        " avx"
#endif
#if defined(HAVE_FMA)
        " fma"
#endif
#if defined(HAVE_SSE4_1)
        " sse"
#endif
#if defined(HAVE_FRAMEWORK_ACCELERATE)
        " accelerate"
#endif
#if defined(HAVE_NEON) || defined(__aarch64__)
        " neon"
#endif
        " std::inner_product.\n");
  }

  dotproduct.set_value(dotproduct_method);
}

} // namespace tesseract

Coverage Report

Created: 2025-06-13 07:02

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: simddetect.cpp
3		// Description: Architecture detector.
4		// Author: Stefan Weil (based on code from Ray Smith)
5		//
6		// (C) Copyright 2014, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		///////////////////////////////////////////////////////////////////////
17
18		#ifdef HAVE_CONFIG_H
19		# include "config_auto.h" // for HAVE_AVX, ...
20		#endif
21		#include <numeric> // for std::inner_product
22		#include "dotproduct.h"
23		#include "intsimdmatrix.h" // for IntSimdMatrix
24		#include "params.h" // for STRING_VAR
25		#include "simddetect.h"
26		#include "tprintf.h" // for tprintf
27
28		#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29		// The GNU compiler g++ fails to compile with the Accelerate framework
30		// (tested with versions 10 and 11), so unconditionally disable it.
31		#undef HAVE_FRAMEWORK_ACCELERATE
32		#endif
33
34		#if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36		// Use Apple Accelerate framework.
37		// https://developer.apple.com/documentation/accelerate/simd
38
39		#include <Accelerate/Accelerate.h>
40
41		#endif
42
43		#if defined(HAVE_AVX) \|\| defined(HAVE_AVX2) \|\| defined(HAVE_FMA) \|\| defined(HAVE_SSE4_1)
44		// See https://en.wikipedia.org/wiki/CPUID.
45		# define HAS_CPUID
46		#endif
47
48		#if defined(HAS_CPUID)
49		# if defined(__GNUC__)
50		# include <cpuid.h>
51		# elif defined(_WIN32)
52		# include <intrin.h>
53		# endif
54		#endif
55
56		#if defined(HAVE_NEON) && !defined(__aarch64__)
57		# if defined(HAVE_ANDROID_GETCPUFAMILY)
58		# include <cpu-features.h>
59		# elif defined(HAVE_GETAUXVAL)
60		# include <asm/hwcap.h>
61		# include <sys/auxv.h>
62		# elif defined(HAVE_ELF_AUX_INFO)
63		# include <sys/auxv.h>
64		# endif
65		#endif
66
67		#if defined(HAVE_RVV)
68		# if defined(HAVE_GETAUXVAL) \|\| defined(HAVE_ELF_AUX_INFO)
69		# include <sys/auxv.h>
70		# define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
71		# endif
72		#endif
73
74		namespace tesseract {
75
76		// Computes and returns the dot product of the two n-vectors u and v.
77		// Note: because the order of addition is different among the different dot
78		// product functions, the results can (and do) vary slightly (although they
79		// agree to within about 4e-15). This produces different results when running
80		// training, despite all random inputs being precisely equal.
81		// To get consistent results, use just one of these dot product functions.
82		// On a test multi-layer network, serial is 57% slower than SSE, and AVX
83		// is about 8% faster than SSE. This suggests that the time is memory
84		// bandwidth constrained and could benefit from holding the reused vector
85		// in AVX registers.
86		DotProductFunction DotProduct;
87
88		static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
89
90		SIMDDetect SIMDDetect::detector;
91
92		#if defined(__aarch64__)
93		// ARMv8 always has NEON.
94		bool SIMDDetect::neon_available_ = true;
95		#elif defined(HAVE_NEON)
96		// If true, then Neon has been detected.
97		bool SIMDDetect::neon_available_;
98		#elif defined(HAVE_RVV)
99		bool SIMDDetect::rvv_available_;
100		#else
101		// If true, then AVX has been detected.
102		bool SIMDDetect::avx_available_;
103		bool SIMDDetect::avx2_available_;
104		bool SIMDDetect::avx512F_available_;
105		bool SIMDDetect::avx512BW_available_;
106		bool SIMDDetect::avx512VNNI_available_;
107		// If true, then FMA has been detected.
108		bool SIMDDetect::fma_available_;
109		// If true, then SSe4.1 has been detected.
110		bool SIMDDetect::sse_available_;
111		#endif
112
113		#if defined(HAVE_FRAMEWORK_ACCELERATE)
114		static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
115		TFloat total = 0;
116		const int stride = 1;
117		#if defined(FAST_FLOAT)
118		vDSP_dotpr(u, stride, v, stride, &total, n);
119		#else
120		vDSP_dotprD(u, stride, v, stride, &total, n);
121		#endif
122		return total;
123		}
124		#endif
125
126		// Computes and returns the dot product of the two n-vectors u and v.
127	0	static TFloat DotProductGeneric(const TFloat u, const TFloat v, int n) {
128	0	TFloat total = 0;
129	0	for (int k = 0; k < n; ++k) {
130	0	total += u[k] * v[k];
131	0	}
132	0	return total;
133	0	}
134
135		// Compute dot product using std::inner_product.
136	0	static TFloat DotProductStdInnerProduct(const TFloat u, const TFloat v, int n) {
137	0	return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
138	0	}
139
140	4	static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
141	4	DotProduct = f;
142	4	IntSimdMatrix::intSimdMatrix = m;
143	4	}
144
145		// Constructor.
146		// Tests the architecture in a system-dependent way to detect AVX, SSE and
147		// any other available SIMD equipment.
148		// __GNUC__ is also defined by compilers that include GNU extensions such as
149		// clang.
150	2	SIMDDetect::SIMDDetect() {
151		// The fallback is a generic dot product calculation.
152	2	SetDotProduct(DotProductGeneric);
153
154	2	#if defined(HAS_CPUID)
155	2	# if defined(__GNUC__)
156	2	unsigned int eax, ebx, ecx, edx;
157	2	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
158		// Note that these tests all use hex because the older compilers don't have
159		// the newer flags.
160	2	# if defined(HAVE_SSE4_1)
161	2	sse_available_ = (ecx & 0x00080000) != 0;
162	2	# endif
163	2	# if defined(HAVE_AVX) \|\| defined(HAVE_AVX2) \|\| defined(HAVE_FMA)
164	2	auto xgetbv = []() {
165	2	uint32_t xcr0;
166	2	__asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
167	2	return xcr0;
168	2	};
169	2	if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
170		// OSXSAVE bit is set, XMM state and YMM state are fine.
171	2	# if defined(HAVE_FMA)
172	2	fma_available_ = (ecx & 0x00001000) != 0;
173	2	# endif
174	2	# if defined(HAVE_AVX)
175	2	avx_available_ = (ecx & 0x10000000) != 0;
176	2	if (avx_available_) {
177		// There is supposed to be a __get_cpuid_count function, but this is all
178		// there is in my cpuid.h. It is a macro for an asm statement and cannot
179		// be used inside an if.
180	2	__cpuid_count(7, 0, eax, ebx, ecx, edx);
181	2	avx2_available_ = (ebx & 0x00000020) != 0;
182	2	avx512F_available_ = (ebx & 0x00010000) != 0;
183	2	avx512BW_available_ = (ebx & 0x40000000) != 0;
184	2	avx512VNNI_available_ = (ecx & 0x00000800) != 0;
185	2	}
186	2	# endif
187	2	}
188	2	# endif
189	2	}
190		# elif defined(_WIN32)
191		int cpuInfo[4];
192		int max_function_id;
193		__cpuid(cpuInfo, 0);
194		max_function_id = cpuInfo[0];
195		if (max_function_id >= 1) {
196		__cpuid(cpuInfo, 1);
197		# if defined(HAVE_SSE4_1)
198		sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
199		# endif
200		# if defined(HAVE_AVX) \|\| defined(HAVE_AVX2) \|\| defined(HAVE_FMA)
201		if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
202		// OSXSAVE bit is set, XMM state and YMM state are fine.
203		# if defined(HAVE_FMA)
204		fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
205		# endif
206		# if defined(HAVE_AVX)
207		avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
208		# endif
209		# if defined(HAVE_AVX2)
210		if (max_function_id >= 7) {
211		__cpuid(cpuInfo, 7);
212		avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
213		avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
214		avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
215		avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;
216		}
217		# endif
218		}
219		# endif
220		}
221		# else
222		# error "I don't know how to test for SIMD with this compiler"
223		# endif
224	2	#endif
225
226		#if defined(HAVE_NEON) && !defined(__aarch64__)
227		# if defined(HAVE_ANDROID_GETCPUFAMILY)
228		{
229		AndroidCpuFamily family = android_getCpuFamily();
230		if (family == ANDROID_CPU_FAMILY_ARM)
231		neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
232		}
233		# elif defined(HAVE_GETAUXVAL)
234		neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
235		# elif defined(HAVE_ELF_AUX_INFO)
236		unsigned long hwcap = 0;
237		elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
238		neon_available_ = hwcap & HWCAP_NEON;
239		# endif
240		#endif
241
242		#if defined(HAVE_RVV)
243		# if defined(HAVE_GETAUXVAL)
244		const unsigned long hwcap = getauxval(AT_HWCAP);
245		rvv_available_ = hwcap & HWCAP_RV('V');
246		# elif defined(HAVE_ELF_AUX_INFO)
247		unsigned long hwcap = 0;
248		elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
249		rvv_available_ = hwcap & HWCAP_RV('V');
250		# endif
251		#endif
252
253		// Select code for calculation of dot product based on autodetection.
254	2	if (false) {
255		// This is a dummy to support conditional compilation.
256	0	#if defined(HAVE_AVX512F)
257	2	} else if (avx512F_available_) {
258		// AVX512F detected.
259	0	SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
260	0	#endif
261	0	#if defined(HAVE_AVX2)
262	2	} else if (avx2_available_) {
263		// AVX2 detected.
264	2	SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
265	2	#endif
266	2	#if defined(HAVE_AVX)
267	2	} else if (avx_available_) {
268		// AVX detected.
269	0	SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
270	0	#endif
271	0	#if defined(HAVE_SSE4_1)
272	0	} else if (sse_available_) {
273		// SSE detected.
274	0	SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
275	0	#endif
276		#if defined(HAVE_NEON) \|\| defined(__aarch64__)
277		} else if (neon_available_) {
278		// NEON detected.
279		SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
280		#endif
281		#if defined(HAVE_RVV)
282		} else if (rvv_available_) {
283		SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV);
284		#endif
285	0	}
286
287	2	const char *dotproduct_env = getenv("DOTPRODUCT");
288	2	if (dotproduct_env != nullptr) {
289		// Override automatic settings by value from environment variable.
290	0	dotproduct = dotproduct_env;
291	0	Update();
292	0	}
293	2	}
294
295	0	void SIMDDetect::Update() {
296		// Select code for calculation of dot product based on the
297		// value of the config variable if that value is not empty.
298	0	const char *dotproduct_method = "generic";
299	0	if (dotproduct == "auto") {
300		// Automatic detection. Nothing to be done.
301	0	} else if (dotproduct == "generic") {
302		// Generic code selected by config variable.
303	0	SetDotProduct(DotProductGeneric);
304	0	dotproduct_method = "generic";
305	0	} else if (dotproduct == "native") {
306		// Native optimized code selected by config variable.
307	0	SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
308	0	dotproduct_method = "native";
309	0	#if defined(HAVE_AVX2)
310	0	} else if (dotproduct == "avx2") {
311		// AVX2 selected by config variable.
312	0	SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
313	0	dotproduct_method = "avx2";
314	0	#endif
315	0	#if defined(HAVE_AVX)
316	0	} else if (dotproduct == "avx") {
317		// AVX selected by config variable.
318	0	SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
319	0	dotproduct_method = "avx";
320	0	#endif
321	0	#if defined(HAVE_FMA)
322	0	} else if (dotproduct == "fma") {
323		// FMA selected by config variable.
324	0	SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
325	0	dotproduct_method = "fma";
326	0	#endif
327	0	#if defined(HAVE_SSE4_1)
328	0	} else if (dotproduct == "sse") {
329		// SSE selected by config variable.
330	0	SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
331	0	dotproduct_method = "sse";
332	0	#endif
333		#if defined(HAVE_FRAMEWORK_ACCELERATE)
334		} else if (dotproduct == "accelerate") {
335		SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
336		#endif
337		#if defined(HAVE_NEON) \|\| defined(__aarch64__)
338		} else if (dotproduct == "neon" && neon_available_) {
339		// NEON selected by config variable.
340		SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
341		dotproduct_method = "neon";
342		#endif
343	0	} else if (dotproduct == "std::inner_product") {
344		// std::inner_product selected by config variable.
345	0	SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
346	0	dotproduct_method = "std::inner_product";
347	0	} else {
348		// Unsupported value of config variable.
349	0	tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
350	0	dotproduct.c_str());
351	0	tprintf(
352	0	"Supported values for dotproduct: auto generic native"
353	0	#if defined(HAVE_AVX2)
354	0	" avx2"
355	0	#endif
356	0	#if defined(HAVE_AVX)
357	0	" avx"
358	0	#endif
359	0	#if defined(HAVE_FMA)
360	0	" fma"
361	0	#endif
362	0	#if defined(HAVE_SSE4_1)
363	0	" sse"
364	0	#endif
365		#if defined(HAVE_FRAMEWORK_ACCELERATE)
366		" accelerate"
367		#endif
368	0	" std::inner_product.\n");
369	0	}
370
371	0	dotproduct.set_value(dotproduct_method);
372	0	}
373
374		} // namespace tesseract