Coverage Report

Created: 2024-05-04 12:45

/proc/self/cwd/external/gemmlowp/internal/common.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// common.h: contains stuff that's used throughout gemmlowp
16
// and should always be available.
17
18
#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19
#define GEMMLOWP_INTERNAL_COMMON_H_
20
21
#include "../internal/platform.h"
22
#include "../profiling/pthread_everywhere.h"
23
24
#include <algorithm>
25
#include <cassert>
26
#include <cmath>
27
#include <cstdlib>
28
29
#include "../internal/detect_platform.h"
30
#include "../profiling/instrumentation.h"
31
32
namespace gemmlowp {
33
34
// Assumed cache line size, in bytes, used to tune alignment and prefetches.
// Querying the real value at runtime would be ideal, but 64 byte cache lines
// are by far the most common, and on a device where this is wrong it should
// be off by no more than a 2x factor, which is considered acceptable.
constexpr int kDefaultCacheLineSize = 64;
40
41
// Assumed L1 and L2 data cache sizes.
//   - The L1 size is taken to be per core.
//   - The L2 size is taken to be shared among all cores; what is called
//     'L2' here is effectively the top-level cache.
//
// On x86 these should ideally be queried at runtime. On ARM that is not
// possible: the instruction that reports them is privileged and Android
// kernels do not expose it to userspace. Fortunately, most ARM devices have
// roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The generic ARM/Android values below are equal to or somewhat lower than
// those, and were found to perform well on both the Nexus 5 and Android One.
// They are in principle too low for typical x86 CPUs, where the L2 value
// should be at least (L3 cache size / number of cores).
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// ARM on Apple platforms: iPhone/iPad.
constexpr int kDefaultL1CacheSize = 48 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Any other ARM or ARM-like hardware (Android implies ARM-like), so tuning
// for ARM is OK here, although on x86 Atom it might be possible, and better,
// to query the cache sizes at runtime.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android, hence likely desktop-class x86 hardware. Larger
// caches are assumed, though they really should be queried at runtime.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware: possibly something unusual, older or embedded.
// Smaller caches are assumed, without departing too far from the
// ARM/Android values, to avoid accidentally exposing unexpected behavior.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 256 * 1024;
#endif
89
90
// The proportion of the cache intended for storing RHS blocks. This should
// be between 0 and 1, and typically closer to 1, because most of the L2
// cache is meant to hold one large RHS block.
//
// Declared constexpr (rather than plain const) so that the value is usable
// in constant expressions such as static_assert; a const float is not.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. The LHS matrix is not
// blocked for L2 cache.
constexpr float kDefaultL2RhsFactor = 1.00f;
#else
constexpr float kDefaultL2RhsFactor = 0.75f;
#endif
101
102
// Size of a SIMD register, in bytes. The dimensions of PackingRegisterBlock
// are derived from it so that such blocks can be loaded efficiently into
// registers and packing code can stay within registers as much as possible.
// The non-SIMD generic fallback code only uses it as a generic array size,
// so any value would work there. Platforms may pick different values, but
// their own optimized packing paths must be consistent with the value chosen.
#if defined(GEMMLOWP_AVX2)
constexpr int kRegisterSize = 32;
#else
constexpr int kRegisterSize = 16;
#endif
116
117
// Issues a hint asking the CPU to prefetch the cache line that contains ptr.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // Aarch64 prefetch instructions are so detailed that compilers cannot know
  // which one __builtin_prefetch should map to; as a result they emit
  // nothing, leaving __builtin_prefetch a no-op on this architecture.
  // "pldl1keep" is usually what is wanted here, meaning: "prefetch for load,
  // into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n"
               :
               : [ptr] "r"(ptr)
               :);
#elif defined(__GNUC__)
  // Clang and GCC both define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch primitive is used for other toolchains; the cast only
  // silences the unused-parameter warning.
  static_cast<void>(ptr);
#endif
}
133
134
// Rounds the runtime value i down to the nearest multiple of the
// compile-time constant Modulus by subtracting the remainder i % Modulus.
// NOTE(review): Modulus is unsigned, so for a negative signed i the
// remainder is computed after the usual arithmetic conversions; confirm that
// callers only pass non-negative values.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  const auto remainder = i % Modulus;
  return i - remainder;
}
Unexecuted instantiation: unsigned long gemmlowp::RoundDown<64u, unsigned long>(unsigned long)
Unexecuted instantiation: int gemmlowp::RoundDown<16u, int>(int)
Unexecuted instantiation: unsigned int gemmlowp::RoundDown<16u, unsigned int>(unsigned int)
Unexecuted instantiation: unsigned int gemmlowp::RoundDown<4u, unsigned int>(unsigned int)
140
141
// Returns the runtime argument rounded up to the nearest multiple of
142
// the fixed Modulus.
143
template <unsigned Modulus, typename Integer>
144
0
Integer RoundUp(Integer i) {
145
0
  return RoundDown<Modulus>(i + Modulus - 1);
146
0
}
Unexecuted instantiation: unsigned long gemmlowp::RoundUp<64u, unsigned long>(unsigned long)
Unexecuted instantiation: int gemmlowp::RoundUp<16u, int>(int)
Unexecuted instantiation: int gemmlowp::RoundUp<4u, int>(int)
147
148
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
//
// b must be nonzero. The result is built from the truncated quotient and the
// remainder instead of the classic (a + b - 1) / b, because that sum
// overflows (or wraps, for unsigned types) when a is close to the maximum of
// Integer, and it also yields a wrong ceiling for some negative operands.
// For a >= 0 and b > 0 the result is the same as the classic formula
// whenever that formula does not overflow.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  const Integer quotient = a / b;    // Truncated toward zero.
  const Integer remainder = a % b;   // Carries the sign of a.
  // Truncation already equals the ceiling when the division is exact or the
  // true quotient is negative. Only an inexact, positive quotient (remainder
  // and divisor of the same sign) has to be bumped by one.
  const bool round_up = remainder != 0 && ((remainder > 0) == (b > 0));
  return round_up ? static_cast<Integer>(quotient + 1) : quotient;
}
153
154
// Returns the argument rounded up to the nearest power of two; a value that
// already is a power of two is returned unchanged.
//
// The highest set bit of n - 1 is smeared into every lower bit position and
// one is added to the result. The smear runs over the full width of Integer,
// so 64-bit arguments above 2^32 are handled too (a fixed sequence of shifts
// ending at 16 only covers 32 bits).
//
// If the rounded value is not representable in Integer, the final increment
// overflows; for unsigned types, and likewise for n == 0, the result wraps
// to 0.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Assumes 8-bit bytes, which avoids pulling in <climits> for CHAR_BIT.
  const unsigned kBitWidth = static_cast<unsigned>(8 * sizeof(Integer));
  Integer i = n - 1;
  // Doubling the shift each round propagates the top bit to all lower bits
  // in log2(width) steps.
  for (unsigned shift = 1; shift < kBitWidth; shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
165
166
// Compile-time predicate: value is true exactly when N is a positive power
// of two (1, 2, 4, 8, ...).
//
// A power of two has a single set bit, so clearing the lowest set bit with
// N & (N - 1) leaves zero. The N > 0 guard rejects zero, which would
// otherwise pass the bit test, and rejects negative values before N - 1 is
// evaluated, so the most negative int cannot overflow.
template <int N>
struct IsPowerOfTwo {
  static constexpr bool value = N > 0 && (N & (N - 1)) == 0;
};
170
171
// Hands the memory range starting at ptr and spanning `size` elements of
// type T (size * sizeof(T) bytes) to GEMMLOWP_MARK_MEMORY_AS_INITIALIZED when
// that macro is defined; otherwise this does nothing.
// NOTE(review): the macro is defined outside this file; presumably it maps
// to a memory-checker annotation -- confirm where it is defined.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  void* const region = static_cast<void*>(ptr);
  const auto region_bytes = size * sizeof(T);
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(region, region_bytes);
#else
  // Nothing to do without the macro; the casts silence unused-parameter
  // warnings.
  static_cast<void>(ptr);
  static_cast<void>(size);
#endif
}
181
182
}  // namespace gemmlowp
183
184
#endif  // GEMMLOWP_INTERNAL_COMMON_H_