Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/third_party/highway/hwy/cache_control.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2020 Google LLC
2
// SPDX-License-Identifier: Apache-2.0
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
//      http://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
16
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
17
#define HIGHWAY_HWY_CACHE_CONTROL_H_
18
19
#include "hwy/base.h"
20
21
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
22
// https://github.com/gperftools/gperftools/issues/946).
23
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
24
#undef HWY_DISABLE_CACHE_CONTROL
25
#define HWY_DISABLE_CACHE_CONTROL
26
#endif
27
28
#ifndef HWY_DISABLE_CACHE_CONTROL
29
// intrin.h is sufficient on MSVC and already included by base.h.
30
#if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
31
#include <emmintrin.h>  // SSE2
32
#include <xmmintrin.h>  // _mm_prefetch
33
#elif HWY_ARCH_ARM_A64
34
#include <arm_acle.h>
35
#endif
36
#endif  // HWY_DISABLE_CACHE_CONTROL
37
38
namespace hwy {
39
40
// Write granularity of `Stream`: it may store a multiple of this size, even
// when N*sizeof(T) is smaller.
#define HWY_STREAM_MULTIPLE 16

// HWY_ATTR_CACHE: function attribute that the cache-control functions below
// may require. It expands to an SSE2 target attribute only for non-MSVC x86
// builds with cache control enabled; in every other configuration it is empty.
#if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL) || HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE
#else
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#endif
49
50
// Windows.h #defines this, which causes infinite recursion. Temporarily
51
// undefine to avoid conflict with our function.
52
// TODO(janwas): remove when this function is removed.
53
#pragma push_macro("LoadFence")
54
#undef LoadFence
55
56
// Delays subsequent loads until prior loads are visible. Beware of potentially
57
// differing behavior across architectures and vendors: on Intel but not
58
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
59
// complete).
60
0
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
61
0
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
62
0
  _mm_lfence();
63
0
#endif
64
0
}
65
66
// TODO(janwas): remove when this function is removed. (See above.)
67
#pragma pop_macro("LoadFence")
68
69
// Ensures values written by previous `Stream` calls are visible on the current
70
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
71
// outputs are to be consumed by other core(s), the producer must publish
72
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
73
0
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
74
0
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
75
0
  _mm_sfence();
76
0
#endif
77
0
}
78
79
// Optionally begins loading the cache line containing "p" to reduce latency of
80
// subsequent actual loads.
81
template <typename T>
82
33.5M
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
83
33.5M
  (void)p;
84
33.5M
#ifndef HWY_DISABLE_CACHE_CONTROL
85
33.5M
#if HWY_ARCH_X86
86
33.5M
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
87
#elif HWY_COMPILER_GCC  // includes clang
88
  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
89
  // desirable, so use the default 3 (keep in caches).
90
  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
91
#endif
92
33.5M
#endif  //  HWY_DISABLE_CACHE_CONTROL
93
33.5M
}
94
95
// Invalidates and flushes the cache line containing "p", if possible.
96
0
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
97
0
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
98
0
  _mm_clflush(p);
99
0
#else
100
0
  (void)p;
101
0
#endif
102
0
}
103
104
// Hints that we are inside a spin loop and potentially reduces power
105
// consumption and coherency traffic. For example, x86 avoids multiple
106
// outstanding load requests, which reduces the memory order violation penalty
107
// when exiting the loop.
108
0
HWY_INLINE HWY_ATTR_CACHE void Pause() {
109
0
#ifndef HWY_DISABLE_CACHE_CONTROL
110
0
#if HWY_ARCH_X86
111
0
  _mm_pause();
112
0
#elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
113
0
  // This is documented in ACLE and the YIELD instruction is also available in
114
0
  // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only.
115
0
  __yield();
116
0
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC  // includes clang
117
0
  __asm__ volatile("yield" ::: "memory");
118
0
#elif HWY_ARCH_PPC && HWY_COMPILER_GCC  // includes clang
119
0
  __asm__ volatile("or 27,27,27" ::: "memory");
120
0
#endif
121
0
#endif  // HWY_DISABLE_CACHE_CONTROL
122
0
}
123
124
}  // namespace hwy
125
126
#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_