/proc/self/cwd/external/gemmlowp/internal/common.h

Source (jump to first uncovered line)
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../internal/detect_platform.h"
#include "../profiling/instrumentation.h"

namespace gemmlowp {

// Assumed size, in bytes, of one cache line. Used to tune alignment and
// prefetching. Querying the real value at runtime would be ideal, but
// 64-byte lines are by far the most common, and on a device where this
// guess is wrong it should be off by no more than a factor of 2, which
// is considered acceptable.
constexpr int kDefaultCacheLineSize = 64;

// Default sizes, in bytes, assumed for the L1 and L2 data caches.
//   - L1 is taken to be per-core.
//   - L2 is taken to be shared by all cores; what is called 'L2' here is
//     effectively the top-level cache.
//
// Ideally x86 builds would query these values at runtime. On ARM, the
// instruction that reports them is privileged and Android kernels do not
// expose it to userspace. Fortunately, most ARM devices have roughly
// comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The ARM/Android defaults below are equal to or somewhat lower than those
// and were found to perform well on both devices. Such values are in
// principle too low for typical x86 CPUs, where the L2 value should be at
// least (L3 cache size / number of cores).
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// Apple ARM devices (iPhone/iPad).
constexpr int kDefaultL1CacheSize = 48 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Any other ARM, or ARM-like, hardware. Android is treated as implying
// ARM-like, so tuning for ARM is fine here, even though on x86 Atom it might
// be possible (and better) to query the cache sizes at runtime.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 without Android, hence most likely desktop-class hardware. Larger
// caches are assumed, though these really should be queried at runtime.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 without Android. Like x86-64, but with a more cautious L2 value.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS without Android. TODO: MIPS and Android?
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware: possibly unusual, older, or embedded. Smaller caches
// are assumed, while staying close to the ARM/Android values so as not to
// accidentally expose unexpected behavior.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 256 * 1024;
#endif

// Fraction of the L2 cache budgeted for holding RHS blocks. Expected to lie
// between 0 and 1, and usually near 1, because most of the L2 cache is
// normally meant to hold one large RHS block.
#if defined(GEMMLOWP_X86)
// On IA the whole L2 cache goes to the RHS matrix; the LHS matrix is not
// blocked for the L2 cache.
constexpr float kDefaultL2RhsFactor = 1.00f;
#else
constexpr float kDefaultL2RhsFactor = 0.75f;
#endif

// Size, in bytes, of a SIMD register. It determines the dimensions of
// PackingRegisterBlock, so that such blocks load efficiently into registers
// and the packing code can stay within registers as much as possible.
// The generic non-SIMD fallback only uses it as an array size, so any value
// works there. Platforms may choose different values, but each must keep its
// own optimized packing paths consistent with the value chosen here.

#if defined(GEMMLOWP_AVX2)
constexpr int kRegisterSize = 32;
#else
constexpr int kRegisterSize = 16;
#endif

// Gives the CPU a hint to prefetch the cache line that contains `ptr`.
// Does nothing when none of the mechanisms below is available.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // AArch64 prefetch instructions are very detailed, so compilers cannot
  // know how to map __builtin_prefetch onto them and, as a result, don't,
  // which leaves __builtin_prefetch a no-op on this architecture. Emit the
  // instruction directly instead. "pldl1keep" is usually the right choice
  // here; it means "prefetch for load, into L1 cache, using each value
  // multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" : : [ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Both Clang and GCC define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch mechanism known for this toolchain; just mark `ptr` as used.
  static_cast<void>(ptr);
#endif
}

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus, computed as i - (i % Modulus).
//
// Modulus must be non-zero: `i % 0` is undefined behavior, so a zero
// Modulus is rejected at compile time.
//
// Meant for non-negative `i`. Modulus is unsigned, so a negative value of a
// signed Integer goes through the usual arithmetic conversions in
// `i % Modulus`, and the result is then not guaranteed to be the
// mathematical floor.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  static_assert(Modulus != 0, "RoundDown requires a non-zero Modulus");
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  // Adding (Modulus - 1) first makes the subsequent round-down land on the
  // next multiple, unless `i` already is one. `auto` keeps whatever type the
  // mixed Integer/unsigned expression produces, and RoundDown deduces its
  // Integer parameter from that type.
  const auto biased = i + Modulus - 1;
  return RoundDown<Modulus>(biased);
}

// Returns a / b rounded up ('ceil') to the nearest integer, computed with
// integer division as (a + b - 1) / b. This matches the mathematical
// ceiling for a >= 0 and b > 0, as long as a + b - 1 does not overflow.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // Bias the numerator so that truncating division rounds upwards.
  const auto biased_numerator = a + b - 1;
  return biased_numerator / b;
}

// Returns the argument rounded up to the nearest power of two. A value that
// already is a power of two is returned unchanged.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position and then adding 1. The shift amounts double (1, 2, 4, ...) up to
// the bit width of Integer, so types wider than 32 bits are handled too;
// stopping at a shift of 16 would leave the low bits unset for values above
// 2^32 and yield a result that is not a power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  // Bit width of Integer, assuming 8-bit bytes.
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}

// Compile-time predicate: `value` is true exactly when N is a positive
// power of two (1, 2, 4, ...).
//
// The bit test N & (N - 1) clears the lowest set bit of N, so it is zero for
// any N with at most one bit set. That also holds for N == 0, which is not a
// power of two, hence the explicit N > 0 requirement.
template <int N>
struct IsPowerOfTwo {
  static constexpr bool value = (N > 0) && ((N & (N - 1)) == 0);
};

// Reports the `size` elements of type T starting at `ptr` as initialized,
// by handing the region (as a void* plus a byte count) to
// GEMMLOWP_MARK_MEMORY_AS_INITIALIZED. When that macro is not defined, this
// function does nothing.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  void* const region = static_cast<void*>(ptr);
  // `size` counts elements; the macro is given a length in bytes.
  const auto region_bytes = size * sizeof(T);
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(region, region_bytes);
#else
  // Nothing to do; just mark both parameters as used.
  static_cast<void>(ptr);
  static_cast<void>(size);
#endif
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_

Coverage Report

Created: 2024-05-04 12:45

Line	Count	Source (jump to first uncovered line)
1		// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		// common.h: contains stuff that's used throughout gemmlowp
16		// and should always be available.
17
18		#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19		#define GEMMLOWP_INTERNAL_COMMON_H_
20
21		#include "../internal/platform.h"
22		#include "../profiling/pthread_everywhere.h"
23
24		#include <algorithm>
25		#include <cassert>
26		#include <cmath>
27		#include <cstdlib>
28
29		#include "../internal/detect_platform.h"
30		#include "../profiling/instrumentation.h"
31
32		namespace gemmlowp {
33
34		// Standard cache line size. Useful to optimize alignment and
35		// prefetches. Ideally we would query this at runtime, however
36		// 64 byte cache lines are the vast majority, and even if it's
37		// wrong on some device, it will be wrong by no more than a 2x factor,
38		// which should be acceptable.
39		const int kDefaultCacheLineSize = 64;
40
41		// Default L1 and L2 data cache sizes.
42		// The L1 cache size is assumed to be for each core.
43		// The L2 cache size is assumed to be shared among all cores. What
44		// we call 'L2' here is effectively top-level cache.
45		//
46		// On x86, we should ideally query this at
47		// runtime. On ARM, the instruction to query this is privileged and
48		// Android kernels do not expose it to userspace. Fortunately, the majority
49		// of ARM devices have roughly comparable values:
50		// Nexus 5: L1 16k, L2 1M
51		// Android One: L1 32k, L2 512k
52		// The following values are equal to or somewhat lower than that, and were
53		// found to perform well on both the Nexus 5 and Android One.
54		// Of course, these values are in principle too low for typical x86 CPUs
55		// where we should set the L2 value to (L3 cache size / number of cores) at
56		// least.
57		//
58		#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
59		// iPhone/iPad
60		const int kDefaultL1CacheSize = 48 * 1024;
61		const int kDefaultL2CacheSize = 2 * 1024 * 1024;
62		#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
63		// Other ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
64		// to tune for ARM, although on x86 Atom we might be able to query
65		// cache sizes at runtime, which would be better.
66		const int kDefaultL1CacheSize = 16 * 1024;
67		const int kDefaultL2CacheSize = 384 * 1024;
68		#elif defined(GEMMLOWP_X86_64)
69		// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
70		// Thus we assume larger cache sizes, though we really should query
71		// them at runtime.
72		const int kDefaultL1CacheSize = 32 * 1024;
73		const int kDefaultL2CacheSize = 4 * 1024 * 1024;
74		#elif defined(GEMMLOWP_X86_32)
75		// x86-32 and not Android. Same as x86-64 but less bullish.
76		const int kDefaultL1CacheSize = 32 * 1024;
77		const int kDefaultL2CacheSize = 2 * 1024 * 1024;
78		#elif defined(GEMMLOWP_MIPS)
79		// MIPS and not Android. TODO: MIPS and Android?
80		const int kDefaultL1CacheSize = 32 * 1024;
81		const int kDefaultL2CacheSize = 1024 * 1024;
82		#else
83		// Less common hardware. Maybe some unusual or older or embedded thing.
84		// Assume smaller caches, but don't depart too far from what we do
85		// on ARM/Android to avoid accidentally exposing unexpected behavior.
86		const int kDefaultL1CacheSize = 16 * 1024;
87		const int kDefaultL2CacheSize = 256 * 1024;
88		#endif
89
90		// The proportion of the cache that we intend to use for storing
91		// RHS blocks. This should be between 0 and 1, and typically closer to 1,
92		// as we typically want to use most of the L2 cache for storing a large
93		// RHS block.
94		#if defined(GEMMLOWP_X86)
95		// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
96		// for L2 cache.
97		const float kDefaultL2RhsFactor = 1.00f;
98		#else
99		const float kDefaultL2RhsFactor = 0.75f;
100		#endif
101
102		// The number of bytes in a SIMD register. This is used to determine
103		// the dimensions of PackingRegisterBlock so that such blocks can
104		// be efficiently loaded into registers, so that packing code can
105		// work within registers as much as possible.
106		// In the non-SIMD generic fallback code, this is just a generic array
107		// size, so any size would work there. Different platforms may set this
108		// to different values but must ensure that their own optimized packing paths
109		// are consistent with this value.
110
111		#ifdef GEMMLOWP_AVX2
112		const int kRegisterSize = 32;
113		#else
114		const int kRegisterSize = 16;
115		#endif
116
117		// Hints the CPU to prefetch the cache line containing ptr.
118	0	inline void Prefetch(const void* ptr) {
119		#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
120		// Aarch64 has very detailed prefetch instructions, that compilers
121		// can't know how to map __builtin_prefetch to, and as a result, don't,
122		// leaving __builtin_prefetch a no-op on this architecture.
123		// For our purposes, "pldl1keep" is usually what we want, meaning:
124		// "prefetch for load, into L1 cache, using each value multiple times".
125		asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
126		#elif defined \
127		__GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch.
128	0	__builtin_prefetch(ptr);
129		#else
130		(void)ptr;
131		#endif
132	0	}
133
134		// Returns the runtime argument rounded down to the nearest multiple of
135		// the fixed Modulus.
136		template <unsigned Modulus, typename Integer>
137	0	Integer RoundDown(Integer i) {
138	0	return i - (i % Modulus);
139	0	} Unexecuted instantiation: unsigned long gemmlowp::RoundDown<64u, unsigned long>(unsigned long) Unexecuted instantiation: int gemmlowp::RoundDown<16u, int>(int) Unexecuted instantiation: unsigned int gemmlowp::RoundDown<16u, unsigned int>(unsigned int) Unexecuted instantiation: unsigned int gemmlowp::RoundDown<4u, unsigned int>(unsigned int)
140
141		// Returns the runtime argument rounded up to the nearest multiple of
142		// the fixed Modulus.
143		template <unsigned Modulus, typename Integer>
144	0	Integer RoundUp(Integer i) {
145	0	return RoundDown<Modulus>(i + Modulus - 1);
146	0	} Unexecuted instantiation: unsigned long gemmlowp::RoundUp<64u, unsigned long>(unsigned long) Unexecuted instantiation: int gemmlowp::RoundUp<16u, int>(int) Unexecuted instantiation: int gemmlowp::RoundUp<4u, int>(int)
147
148		// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
149		template <typename Integer>
150	0	Integer CeilQuotient(Integer a, Integer b) {
151	0	return (a + b - 1) / b;
152	0	}
153
154		// Returns the argument rounded up to the nearest power of two.
155		template <typename Integer>
156	0	Integer RoundUpToPowerOfTwo(Integer n) {
157	0	Integer i = n - 1;
158	0	i |= i >> 1;
159	0	i |= i >> 2;
160	0	i |= i >> 4;
161	0	i |= i >> 8;
162	0	i |= i >> 16;
163	0	return i + 1;
164	0	}
165
166		template <int N>
167		struct IsPowerOfTwo {
168		static constexpr bool value = !(N & (N - 1));
169		};
170
171		template <typename T>
172		void MarkMemoryAsInitialized(T* ptr, int size) {
173		#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
174		GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
175		size * sizeof(T));
176		#else
177		(void)ptr;
178		(void)size;
179		#endif
180		}
181
182		} // namespace gemmlowp
183
184		#endif // GEMMLOWP_INTERNAL_COMMON_H_