/proc/self/cwd/external/gemmlowp/internal/common.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // common.h: contains stuff that's used throughout gemmlowp |
16 | | // and should always be available. |
17 | | |
18 | | #ifndef GEMMLOWP_INTERNAL_COMMON_H_ |
19 | | #define GEMMLOWP_INTERNAL_COMMON_H_ |
20 | | |
21 | | #include "../internal/platform.h" |
22 | | #include "../profiling/pthread_everywhere.h" |
23 | | |
24 | | #include <algorithm> |
25 | | #include <cassert> |
26 | | #include <cmath> |
27 | | #include <cstdlib> |
28 | | |
29 | | #include "../internal/detect_platform.h" |
30 | | #include "../profiling/instrumentation.h" |
31 | | |
32 | | namespace gemmlowp { |
33 | | |
// Cache line size assumed when optimizing alignment and prefetches.
// We would ideally query this at runtime, but 64-byte lines are the
// overwhelming majority; on the rare device where this is wrong, it is
// wrong by at most a factor of two, which is acceptable.
const int kDefaultCacheLineSize = 64;
40 | | |
// Default data cache sizes used to pick matrix blocking parameters:
// the L1 size is per core, while what we call 'L2' here is the shared
// top-level cache.
//
// These should ideally be queried at runtime on x86; on ARM the query
// instruction is privileged and Android kernels do not expose it to
// userspace. Fortunately most ARM devices have comparable values
// (Nexus 5: L1 16k, L2 1M; Android One: L1 32k, L2 512k), and the
// values below — equal to or somewhat lower than those — were found to
// perform well on both. They are in principle too low for typical x86
// CPUs, where the L2 value should be at least
// (L3 cache size / number of cores).
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// Apple ARM hardware (iPhone/iPad).
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like), so tuning
// for ARM is OK here — although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android: likely desktop-class hardware, so assume
// larger caches (though we really should query them at runtime).
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android: same idea as x86-64, but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware — perhaps something unusual, older, or
// embedded. Assume smaller caches, but stay close to the ARM/Android
// values to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
89 | | |
// Fraction of the L2 cache budgeted for storing RHS blocks. Must lie
// in (0, 1], and is typically close to 1 since a large RHS block is
// what we mainly want resident in L2.
#if defined(GEMMLOWP_X86)
// On IA, dedicate the entire L2 cache to the RHS matrix; the LHS
// matrix is not blocked for L2 cache there.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif
101 | | |
// Width in bytes of a SIMD register. This sizes PackingRegisterBlock
// so such blocks load efficiently into registers, letting packing code
// stay register-resident as much as possible. The non-SIMD generic
// fallback treats this as an ordinary array size, so any value would
// work there; platforms that override it must keep their optimized
// packing paths consistent with this value.

#ifdef GEMMLOWP_AVX2
const int kRegisterSize = 32;
#else
const int kRegisterSize = 16;
#endif
116 | | |
// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // Aarch64 offers prefetch instructions more detailed than
  // __builtin_prefetch can express, so compilers don't know how to map
  // the builtin and leave it a no-op on this architecture. Emit the
  // variant we usually want, "pldl1keep": prefetch for load, into L1,
  // for data whose values are used multiple times.
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Both Clang and GCC define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch available; just silence the unused-parameter warning.
  (void)ptr;
#endif
}
133 | | |
// Returns the runtime argument rounded down to the nearest multiple of
// the compile-time constant Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  // Truncating division followed by re-multiplication drops the
  // remainder, i.e. rounds toward zero to a multiple of Modulus.
  return (i / Modulus) * Modulus;
}
140 | | |
// Returns the runtime argument rounded up to the nearest multiple of
// the compile-time constant Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  // Bump to just below the next multiple, then truncate back down:
  // exact multiples are unchanged, everything else rounds up.
  const Integer bumped = i + Modulus - 1;
  return bumped - (bumped % Modulus);
}
147 | | |
// Returns the quotient a / b rounded up ('ceil') to the nearest
// integer. Assumes b > 0 and that a + b - 1 does not overflow.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  const Integer numerator = a + b - 1;
  return numerator / b;
}
153 | | |
// Returns the argument rounded up to the nearest power of two.
// Intended for n >= 1; n already a power of two is returned unchanged.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  // Smear the highest set bit of i into every lower bit position, then
  // add 1 to land on the next power of two. Doubling the shift amount
  // up to the bit width of Integer handles any integer size: the
  // previous fixed shift sequence (1, 2, 4, 8, 16) silently produced
  // wrong results for 64-bit values above 2^32. For 32-bit-or-smaller
  // types this performs exactly the same shifts as before.
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
165 | | |
// Compile-time test for whether N is a power of two.
template <int N>
struct IsPowerOfTwo {
  // A power of two has exactly one bit set. The N > 0 guard fixes the
  // bare !(N & (N - 1)) expression, which wrongly reported N == 0 as a
  // power of two; short-circuiting also avoids evaluating N - 1 for
  // non-positive N (overflow for INT_MIN).
  static constexpr bool value = N > 0 && !(N & (N - 1));
};
170 | | |
// Tells memory-sanitizer-style tooling that the size elements starting
// at ptr hold valid data. Compiles to nothing unless the build defines
// a GEMMLOWP_MARK_MEMORY_AS_INITIALIZED macro to perform the marking.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  // Parameters are unused in normal builds; suppress warnings.
  static_cast<void>(ptr);
  static_cast<void>(size);
#endif
}
181 | | |
182 | | } // namespace gemmlowp |
183 | | |
184 | | #endif // GEMMLOWP_INTERNAL_COMMON_H_ |