/src/libjxl/third_party/highway/hwy/cache_control.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2020 Google LLC |
2 | | // SPDX-License-Identifier: Apache-2.0 |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | // you may not use this file except in compliance with the License. |
6 | | // You may obtain a copy of the License at |
7 | | // |
8 | | // http://www.apache.org/licenses/LICENSE-2.0 |
9 | | // |
10 | | // Unless required by applicable law or agreed to in writing, software |
11 | | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | // See the License for the specific language governing permissions and |
14 | | // limitations under the License. |
15 | | |
16 | | #ifndef HIGHWAY_HWY_CACHE_CONTROL_H_ |
17 | | #define HIGHWAY_HWY_CACHE_CONTROL_H_ |
18 | | |
19 | | #include "hwy/base.h" |
20 | | |
21 | | // The x86 intrinsics used below require SSE2 and fail to compile on 32-bit Clang 7 (see |
22 | | // https://github.com/gperftools/gperftools/issues/946), so force-define HWY_DISABLE_CACHE_CONTROL in either case. NOTE(review): __SSE2__ is presumably undefined on non-x86 targets, which would also compile out the non-x86 branches of Prefetch/Pause - confirm this is intended. |
23 | | #if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) |
24 | | #undef HWY_DISABLE_CACHE_CONTROL |
25 | | #define HWY_DISABLE_CACHE_CONTROL |
26 | | #endif |
27 | | |
28 | | #ifndef HWY_DISABLE_CACHE_CONTROL |
29 | | // intrin.h is sufficient on MSVC and already included by base.h. |
30 | | #if HWY_ARCH_X86 && !HWY_COMPILER_MSVC |
31 | | #include <emmintrin.h> // SSE2 |
32 | | #include <xmmintrin.h> // _mm_prefetch |
33 | | #elif HWY_ARCH_ARM_A64 |
34 | | #include <arm_acle.h> |
35 | | #endif |
36 | | #endif // HWY_DISABLE_CACHE_CONTROL |
37 | | |
38 | | namespace hwy { |
39 | | |
40 | | // `Stream` (not defined in this header) may write a multiple of this size (presumably bytes), even if N*sizeof(T) is smaller. |
41 | | #define HWY_STREAM_MULTIPLE 16 |
42 | | |
43 | | // Attribute applied to the functions below: target("sse2") on non-MSVC x86 builds with cache control enabled, empty otherwise. |
44 | | #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC |
45 | | #define HWY_ATTR_CACHE __attribute__((target("sse2"))) |
46 | | #else |
47 | | #define HWY_ATTR_CACHE |
48 | | #endif |
49 | | |
50 | | // Windows.h #defines LoadFence as a macro, which causes infinite recursion. Save |
51 | | // that macro and undefine it while our function is defined; pop_macro below restores it. |
52 | | // TODO(janwas): remove this workaround when LoadFence itself is removed. |
53 | | #pragma push_macro("LoadFence") |
54 | | #undef LoadFence |
55 | | |
56 | | // Delays subsequent loads until prior loads are visible. Beware of potentially |
57 | | // differing behavior across architectures and vendors: on Intel but not |
58 | | // AMD CPUs, this also serves as a full fence (waits for all prior instructions to |
59 | | // complete). Issues _mm_lfence on x86; no-op elsewhere or if HWY_DISABLE_CACHE_CONTROL is defined. |
60 | 0 | HWY_INLINE HWY_ATTR_CACHE void LoadFence() { |
61 | 0 | #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) |
62 | 0 | _mm_lfence(); |
63 | 0 | #endif |
64 | 0 | } |
65 | | |
66 | | // Restores the LoadFence macro saved above. TODO(janwas): remove when this function is removed. |
67 | | #pragma pop_macro("LoadFence") |
68 | | |
69 | | // Ensures values written by previous `Stream` calls are visible on the current |
70 | | // core. This is NOT sufficient for synchronizing across cores; when `Stream` |
71 | | // outputs are to be consumed by other core(s), the producer must publish |
72 | | // availability (e.g. via mutex or atomic_flag) after `FlushStream`. Issues _mm_sfence on x86; no-op elsewhere or if HWY_DISABLE_CACHE_CONTROL is defined. |
73 | 0 | HWY_INLINE HWY_ATTR_CACHE void FlushStream() { |
74 | 0 | #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) |
75 | 0 | _mm_sfence(); |
76 | 0 | #endif |
77 | 0 | } |
78 | | |
79 | | // Optionally begins loading the cache line containing "p" to reduce latency of |
80 | | // subsequent actual loads. x86 uses _mm_prefetch(_MM_HINT_T0); other GCC/Clang targets use __builtin_prefetch; no-op if HWY_DISABLE_CACHE_CONTROL is defined or no branch applies. |
81 | | template <typename T> |
82 | 33.5M | HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { |
83 | 33.5M | (void)p; |
84 | 33.5M | #ifndef HWY_DISABLE_CACHE_CONTROL |
85 | 33.5M | #if HWY_ARCH_X86 |
86 | 33.5M | _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0); |
87 | | #elif HWY_COMPILER_GCC // includes clang |
88 | | // Read prefetch (write=0). Hint=0 (NTA) behavior differs, but skipping outer caches is probably not |
89 | | // desirable, so use the default hint of 3 (keep in caches). |
90 | | __builtin_prefetch(p, /*write=*/0, /*hint=*/3); |
91 | | #endif |
92 | 33.5M | #endif // HWY_DISABLE_CACHE_CONTROL |
93 | 33.5M | } |
94 | | |
95 | | // Invalidates and flushes the cache line containing "p" via _mm_clflush on x86; otherwise (or if HWY_DISABLE_CACHE_CONTROL is defined) "p" is ignored and this is a no-op. |
96 | 0 | HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) { |
97 | 0 | #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) |
98 | 0 | _mm_clflush(p); |
99 | 0 | #else |
100 | 0 | (void)p; |
101 | 0 | #endif |
102 | 0 | } |
103 | | |
104 | | // Hints that we are inside a spin loop, which potentially reduces power |
105 | | // consumption and coherency traffic. For example, x86 avoids multiple |
106 | | // outstanding load requests, which reduces the memory order violation penalty |
107 | | // when exiting the loop. Uses _mm_pause on x86, a yield on Arm, "or 27,27,27" on PPC; no-op otherwise or if HWY_DISABLE_CACHE_CONTROL is defined. |
108 | 0 | HWY_INLINE HWY_ATTR_CACHE void Pause() { |
109 | 0 | #ifndef HWY_DISABLE_CACHE_CONTROL |
110 | 0 | #if HWY_ARCH_X86 |
111 | 0 | _mm_pause(); |
112 | 0 | #elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG |
113 | 0 | // __yield is documented in ACLE and the YIELD instruction is also available in |
114 | 0 | // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only here. |
115 | 0 | __yield(); |
116 | 0 | #elif HWY_ARCH_ARM && HWY_COMPILER_GCC // includes clang |
117 | 0 | __asm__ volatile("yield" ::: "memory"); |
118 | 0 | #elif HWY_ARCH_PPC && HWY_COMPILER_GCC // includes clang |
119 | 0 | __asm__ volatile("or 27,27,27" ::: "memory"); |
120 | 0 | #endif |
121 | 0 | #endif // HWY_DISABLE_CACHE_CONTROL |
122 | 0 | } |
123 | | |
124 | | } // namespace hwy |
125 | | |
126 | | #endif // HIGHWAY_HWY_CACHE_CONTROL_H_ |