/src/libjxl/third_party/highway/hwy/targets.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2020 Google LLC |
2 | | // SPDX-License-Identifier: Apache-2.0 |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | // you may not use this file except in compliance with the License. |
6 | | // You may obtain a copy of the License at |
7 | | // |
8 | | // http://www.apache.org/licenses/LICENSE-2.0 |
9 | | // |
10 | | // Unless required by applicable law or agreed to in writing, software |
11 | | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | // See the License for the specific language governing permissions and |
14 | | // limitations under the License. |
15 | | |
16 | | #ifndef HIGHWAY_HWY_TARGETS_H_ |
17 | | #define HIGHWAY_HWY_TARGETS_H_ |
18 | | |
19 | | // Allows opting out of C++ standard library usage, which is not available in |
20 | | // some Compiler Explorer environments. |
21 | | #ifndef HWY_NO_LIBCXX |
22 | | #include <vector> |
23 | | #endif |
24 | | |
25 | | // For SIMD module implementations and their callers. Defines which targets to |
26 | | // generate and call. |
27 | | |
28 | | #include "hwy/base.h" |
29 | | #include "hwy/detect_targets.h" |
30 | | #include "hwy/highway_export.h" |
31 | | |
32 | | #if !defined(HWY_NO_LIBCXX) |
33 | | #include <atomic> |
34 | | #endif |
35 | | |
36 | | namespace hwy { |
37 | | |
38 | | // Returns bitfield of enabled targets that are supported on this CPU; there is |
39 | | // always at least one such target, hence the return value is never 0. The |
40 | | // targets returned may change after calling DisableTargets. This function is |
41 | | // always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding |
42 | | // calls to it if there is only a single target enabled. |
43 | | HWY_DLLEXPORT int64_t SupportedTargets(); |
44 | | |
45 | | // Evaluates to a function call, or literal if there is a single target. |
46 | | #if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 |
47 | | #define HWY_SUPPORTED_TARGETS HWY_TARGETS |
48 | | #else |
49 | | #define HWY_SUPPORTED_TARGETS hwy::SupportedTargets() |
50 | | #endif |
51 | | |
52 | | // Subsequent SupportedTargets will not return targets whose bit(s) are set in |
53 | | // `disabled_targets`. Exception: if SupportedTargets would return 0, it will |
54 | | // instead return HWY_STATIC_TARGET (there must always be one target to call). |
55 | | // |
56 | | // This function is useful for disabling targets known to be buggy, or if the |
57 | | // best available target is undesirable (perhaps due to throttling or memory |
58 | | // bandwidth limitations). Use SetSupportedTargetsForTest instead of this |
59 | | // function for iteratively enabling specific targets for testing. |
60 | | HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets); |
61 | | |
62 | | // Subsequent SupportedTargets will return the given set of targets, except |
63 | | // those disabled via DisableTargets. Call with a mask of 0 to disable the mock |
64 | | // and return to the normal SupportedTargets behavior. Used to run tests for |
65 | | // all targets. |
66 | | HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets); |
67 | | |
68 | | #ifndef HWY_NO_LIBCXX |
69 | | |
70 | | // Returns the targets present in both HWY_TARGETS and SupportedTargets() as a |
71 | | // list of single-bit HWY_* values such as HWY_SCALAR or HWY_NEON, ordered from |
72 | | // lowest to highest bit. Affected by any SetSupportedTargetsForTest() mock. |
73 | 0 | HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() { |
74 | 0 | std::vector<int64_t> ret; |
75 | 0 | for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0; |
76 | 0 | targets = targets & (targets - 1)) { |
77 | 0 | int64_t current_target = targets & ~(targets - 1); |
78 | 0 | ret.push_back(current_target); |
79 | 0 | } |
80 | 0 | return ret; |
81 | 0 | } |
82 | | |
83 | | #endif // HWY_NO_LIBCXX |
84 | | |
85 | 0 | static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { |
86 | 0 | switch (target) { |
87 | 0 | #if HWY_ARCH_X86 |
88 | 0 | case HWY_SSE2: |
89 | 0 | return "SSE2"; |
90 | 0 | case HWY_SSSE3: |
91 | 0 | return "SSSE3"; |
92 | 0 | case HWY_SSE4: |
93 | 0 | return "SSE4"; |
94 | 0 | case HWY_AVX2: |
95 | 0 | return "AVX2"; |
96 | 0 | case HWY_AVX3: |
97 | 0 | return "AVX3"; |
98 | 0 | case HWY_AVX3_DL: |
99 | 0 | return "AVX3_DL"; |
100 | 0 | case HWY_AVX3_ZEN4: |
101 | 0 | return "AVX3_ZEN4"; |
102 | 0 | case HWY_AVX3_SPR: |
103 | 0 | return "AVX3_SPR"; |
104 | 0 | #endif |
105 | 0 |
|
106 | 0 | #if HWY_ARCH_ARM |
107 | 0 | case HWY_SVE2_128: |
108 | 0 | return "SVE2_128"; |
109 | 0 | case HWY_SVE_256: |
110 | 0 | return "SVE_256"; |
111 | 0 | case HWY_SVE2: |
112 | 0 | return "SVE2"; |
113 | 0 | case HWY_SVE: |
114 | 0 | return "SVE"; |
115 | 0 | case HWY_NEON_BF16: |
116 | 0 | return "NEON_BF16"; |
117 | 0 | case HWY_NEON: |
118 | 0 | return "NEON"; |
119 | 0 | case HWY_NEON_WITHOUT_AES: |
120 | 0 | return "NEON_WITHOUT_AES"; |
121 | 0 | #endif |
122 | 0 |
|
123 | 0 | #if HWY_ARCH_PPC |
124 | 0 | case HWY_PPC8: |
125 | 0 | return "PPC8"; |
126 | 0 | case HWY_PPC9: |
127 | 0 | return "PPC9"; |
128 | 0 | case HWY_PPC10: |
129 | 0 | return "PPC10"; |
130 | 0 | #endif |
131 | 0 |
|
132 | 0 | #if HWY_ARCH_S390X |
133 | 0 | case HWY_Z14: |
134 | 0 | return "Z14"; |
135 | 0 | case HWY_Z15: |
136 | 0 | return "Z15"; |
137 | 0 | #endif |
138 | 0 |
|
139 | 0 | #if HWY_ARCH_WASM |
140 | 0 | case HWY_WASM: |
141 | 0 | return "WASM"; |
142 | 0 | case HWY_WASM_EMU256: |
143 | 0 | return "WASM_EMU256"; |
144 | 0 | #endif |
145 | 0 |
|
146 | 0 | #if HWY_ARCH_RISCV |
147 | 0 | case HWY_RVV: |
148 | 0 | return "RVV"; |
149 | 0 | #endif |
150 | 0 |
|
151 | 0 | case HWY_EMU128: |
152 | 0 | return "EMU128"; |
153 | 0 | case HWY_SCALAR: |
154 | 0 | return "SCALAR"; |
155 | 0 |
|
156 | 0 | default: |
157 | 0 | return "Unknown"; // fallback for any value not listed above; must satisfy gtest IsValidParamName() |
158 | 0 | } |
159 | 0 | } Unexecuted instantiation: enc_cluster.cc:hwy::TargetName(long) Unexecuted instantiation: targets.cc:hwy::TargetName(long) Unexecuted instantiation: enc_lz77.cc:hwy::TargetName(long) Unexecuted instantiation: enc_detect_dots.cc:hwy::TargetName(long) Unexecuted instantiation: enc_convolve_separable5.cc:hwy::TargetName(long) Unexecuted instantiation: enc_xyb.cc:hwy::TargetName(long) Unexecuted instantiation: butteraugli.cc:hwy::TargetName(long) Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::TargetName(long) Unexecuted instantiation: enc_group.cc:hwy::TargetName(long) Unexecuted instantiation: enc_chroma_from_luma.cc:hwy::TargetName(long) Unexecuted instantiation: enc_ac_strategy.cc:hwy::TargetName(long) Unexecuted instantiation: enc_entropy_coder.cc:hwy::TargetName(long) Unexecuted instantiation: jxl_cms.cc:hwy::TargetName(long) Unexecuted instantiation: enc_ma.cc:hwy::TargetName(long) Unexecuted instantiation: compressed_dc.cc:hwy::TargetName(long) Unexecuted instantiation: convolve_symmetric5.cc:hwy::TargetName(long) Unexecuted instantiation: dec_context_map.cc:hwy::TargetName(long) Unexecuted instantiation: dec_external_image.cc:hwy::TargetName(long) Unexecuted instantiation: dec_group.cc:hwy::TargetName(long) Unexecuted instantiation: dec_modular.cc:hwy::TargetName(long) Unexecuted instantiation: dec_noise.cc:hwy::TargetName(long) Unexecuted instantiation: dec_xyb.cc:hwy::TargetName(long) Unexecuted instantiation: squeeze.cc:hwy::TargetName(long) Unexecuted instantiation: rct.cc:hwy::TargetName(long) Unexecuted instantiation: quant_weights.cc:hwy::TargetName(long) Unexecuted instantiation: stage_blending.cc:hwy::TargetName(long) Unexecuted instantiation: stage_chroma_upsampling.cc:hwy::TargetName(long) Unexecuted instantiation: stage_cms.cc:hwy::TargetName(long) Unexecuted instantiation: stage_epf.cc:hwy::TargetName(long) Unexecuted instantiation: stage_from_linear.cc:hwy::TargetName(long) Unexecuted instantiation: 
stage_gaborish.cc:hwy::TargetName(long) Unexecuted instantiation: stage_noise.cc:hwy::TargetName(long) Unexecuted instantiation: stage_splines.cc:hwy::TargetName(long) Unexecuted instantiation: stage_to_linear.cc:hwy::TargetName(long) Unexecuted instantiation: stage_tone_mapping.cc:hwy::TargetName(long) Unexecuted instantiation: stage_upsampling.cc:hwy::TargetName(long) Unexecuted instantiation: stage_write.cc:hwy::TargetName(long) Unexecuted instantiation: stage_xyb.cc:hwy::TargetName(long) Unexecuted instantiation: stage_ycbcr.cc:hwy::TargetName(long) Unexecuted instantiation: simd_util.cc:hwy::TargetName(long) Unexecuted instantiation: splines.cc:hwy::TargetName(long) |
160 | | |
161 | | // The maximum number of dynamic targets on any architecture is defined by |
162 | | // HWY_MAX_DYNAMIC_TARGETS and depends on the arch. |
163 | | |
164 | | // For the ChosenTarget mask and index we use a different bit arrangement than |
165 | | // in the HWY_TARGETS mask. Only the targets involved in the current |
166 | | // architecture are used in this mask, and therefore only the least significant |
167 | | // (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least |
168 | | // significant bit is set when the mask is not initialized, the next |
169 | | // HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the |
170 | | // HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to |
171 | | // that position and the next more significant bit is used for HWY_SCALAR (if |
172 | | // HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to |
173 | | // define equivalent values for HWY_TARGETS in this representation. |
174 | | // This mask representation allows using ctz() on the mask to obtain a small |
175 | | // number that is used as an index into the table for dynamic dispatch. In this |
176 | | // way the first entry is used when the mask is uninitialized, the following |
177 | | // HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for |
178 | | // scalar. |
179 | | |
180 | | // The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format: the bit just above the HWY_MAX_DYNAMIC_TARGETS dynamic-target bits. |
181 | 8.46M | #define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1)) |
182 | | |
183 | | // Converts a HWY_TARGETS-format mask X to the ChosenTarget mask format for the current architecture: keeps the |
184 | | // HWY_MAX_DYNAMIC_TARGETS bits ending at HWY_HIGHEST_TARGET_BIT and moves them to bits [1, HWY_MAX_DYNAMIC_TARGETS]; bit 0 stays clear. |
185 | | #define HWY_CHOSEN_TARGET_SHIFT(X) \ |
186 | 8.46M | ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \ |
187 | 8.46M | ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \ |
188 | 8.46M | << 1) |
189 | | |
190 | | // The HWY_TARGETS mask in the ChosenTarget mask format, plus the HWY_SCALAR/HWY_EMU128 bit and bit 0 (the "not initialized" bit). |
191 | | #define HWY_CHOSEN_TARGET_MASK_TARGETS \ |
192 | 8.46M | (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL) |
193 | | |
194 | | #if HWY_ARCH_X86 |
195 | | // Maximum number of dynamic targets. Changing this value is an ABI-incompatible |
196 | | // change. |
197 | 25.4M | #define HWY_MAX_DYNAMIC_TARGETS 15 |
198 | 8.46M | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86 |
199 | | // Entries must follow the order in which the HWY_TARGETS bits are defined, |
200 | | // starting from the least significant bit, (HWY_HIGHEST_TARGET_BIT + 1 - |
201 | | // HWY_MAX_DYNAMIC_TARGETS). This list must contain exactly |
202 | | // HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry |
203 | | // corresponds to the best target. Don't include a "," at the end of the list. |
204 | | #define HWY_CHOOSE_TARGET_LIST(func_name) \ |
205 | | nullptr, /* reserved */ \ |
206 | | nullptr, /* reserved */ \ |
207 | | nullptr, /* reserved */ \ |
208 | | nullptr, /* reserved */ \ |
209 | | HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \ |
210 | | nullptr, /* reserved */ \ |
211 | | HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \ |
212 | | HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \ |
213 | | HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \ |
214 | | HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \ |
215 | | nullptr, /* AVX */ \ |
216 | | HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \ |
217 | | HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \ |
218 | | nullptr, /* reserved - SSE3? */ \ |
219 | | HWY_CHOOSE_SSE2(func_name) /* SSE2 */ |
220 | | |
221 | | #elif HWY_ARCH_ARM |
222 | | // See HWY_ARCH_X86 above for details. |
223 | | #define HWY_MAX_DYNAMIC_TARGETS 15 |
224 | | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM |
225 | | #define HWY_CHOOSE_TARGET_LIST(func_name) \ |
226 | | nullptr, /* reserved */ \ |
227 | | nullptr, /* reserved */ \ |
228 | | nullptr, /* reserved */ \ |
229 | | HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \ |
230 | | HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \ |
231 | | nullptr, /* reserved */ \ |
232 | | nullptr, /* reserved */ \ |
233 | | nullptr, /* reserved */ \ |
234 | | HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ |
235 | | HWY_CHOOSE_SVE(func_name), /* SVE */ \ |
236 | | nullptr, /* reserved */ \ |
237 | | HWY_CHOOSE_NEON_BF16(func_name), /* NEON + f16/dot/bf16 */ \ |
238 | | nullptr, /* reserved */ \ |
239 | | HWY_CHOOSE_NEON(func_name), /* NEON */ \ |
240 | | HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */ |
241 | | |
242 | | #elif HWY_ARCH_RISCV |
243 | | // See HWY_ARCH_X86 above for details. |
244 | | #define HWY_MAX_DYNAMIC_TARGETS 9 |
245 | | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV |
246 | | #define HWY_CHOOSE_TARGET_LIST(func_name) \ |
247 | | nullptr, /* reserved */ \ |
248 | | nullptr, /* reserved */ \ |
249 | | nullptr, /* reserved */ \ |
250 | | nullptr, /* reserved */ \ |
251 | | nullptr, /* reserved */ \ |
252 | | nullptr, /* reserved */ \ |
253 | | nullptr, /* reserved */ \ |
254 | | HWY_CHOOSE_RVV(func_name), /* RVV */ \ |
255 | | nullptr /* reserved */ |
256 | | |
257 | | #elif HWY_ARCH_PPC || HWY_ARCH_S390X |
258 | | // See HWY_ARCH_X86 above for details. |
259 | | #define HWY_MAX_DYNAMIC_TARGETS 9 |
260 | | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC |
261 | | #define HWY_CHOOSE_TARGET_LIST(func_name) \ |
262 | | nullptr, /* reserved */ \ |
263 | | nullptr, /* reserved */ \ |
264 | | nullptr, /* reserved */ \ |
265 | | nullptr, /* reserved */ \ |
266 | | HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \ |
267 | | HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \ |
268 | | HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \ |
269 | | HWY_CHOOSE_Z15(func_name), /* Z15 */ \ |
270 | | HWY_CHOOSE_Z14(func_name) /* Z14 */ |
271 | | |
272 | | #elif HWY_ARCH_WASM |
273 | | // See HWY_ARCH_X86 above for details. |
274 | | #define HWY_MAX_DYNAMIC_TARGETS 9 |
275 | | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM |
276 | | #define HWY_CHOOSE_TARGET_LIST(func_name) \ |
277 | | nullptr, /* reserved */ \ |
278 | | nullptr, /* reserved */ \ |
279 | | nullptr, /* reserved */ \ |
280 | | nullptr, /* reserved */ \ |
281 | | nullptr, /* reserved */ \ |
282 | | nullptr, /* reserved */ \ |
283 | | HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \ |
284 | | HWY_CHOOSE_WASM(func_name), /* WASM */ \ |
285 | | nullptr /* reserved */ |
286 | | |
287 | | #else |
288 | | // Unknown architecture: will use HWY_SCALAR without dynamic dispatch, though |
289 | | // still creating single-entry tables in HWY_EXPORT to ensure portability. HWY_CHOOSE_TARGET_LIST is not defined in this branch. |
290 | | #define HWY_MAX_DYNAMIC_TARGETS 1 |
291 | | #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR |
292 | | #endif |
293 | | |
294 | | // Bitfield of supported and enabled targets. The format differs from that of |
295 | | // HWY_TARGETS; the lowest bit governs the first function pointer (which calls |
296 | | // FunctionCache, then Update, then dispatches to the actual implementation) in |
297 | | // the tables created by HWY_EXPORT. Monostate (see GetChosenTarget). Accesses |
298 | | // are atomic unless HWY_NO_LIBCXX is defined (then `mask_` is a plain int64_t). |
299 | | struct ChosenTarget { |
300 | | public: |
301 | | // Reset bits according to `targets` (typically the return value of |
302 | | // SupportedTargets()). Postcondition: IsInitialized() == true. |
303 | 4 | void Update(int64_t targets) { |
304 | | // These are `targets` shifted downwards, see above. Also include SCALAR |
305 | | // (corresponds to the last entry in the function table) as fallback. |
306 | 4 | StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR); |
307 | 4 | } |
308 | | |
309 | | // Reset to the uninitialized state, so that FunctionCache will call Update |
310 | | // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false. |
311 | 0 | void DeInit() { StoreMask(1); } |
312 | | |
313 | | // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH |
314 | | // function was called, which we check in tests. |
315 | 0 | bool IsInitialized() const { return LoadMask() != 1; } |
316 | | |
317 | | // Return the index in the dynamic dispatch table to be used by the current |
318 | | // CPU. Note that this method must be in the header file so it uses the value |
319 | | // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that |
320 | | // calls it, which may be different from others. This means we only enable |
321 | | // those targets that were actually compiled in this module. |
322 | 8.46M | size_t HWY_INLINE GetIndex() const { |
323 | 8.46M | return hwy::Num0BitsBelowLS1Bit_Nonzero64( |
324 | 8.46M | static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS)); |
325 | 8.46M | } |
326 | | |
327 | | private: |
328 | | #if defined(HWY_NO_LIBCXX) |
329 | | int64_t LoadMask() const { return mask_; } |
330 | | void StoreMask(int64_t mask) { mask_ = mask; } |
331 | | |
332 | | int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0. |
333 | | #else |
334 | 8.46M | int64_t LoadMask() const { return mask_.load(); } |
335 | 4 | void StoreMask(int64_t mask) { mask_.store(mask); } |
336 | | |
337 | | std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0. |
338 | | #endif // HWY_NO_LIBCXX |
339 | | }; |
340 | | |
341 | | // For internal use (e.g. by FunctionCache and DisableTargets). |
342 | | HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); |
343 | | |
344 | | } // namespace hwy |
345 | | |
346 | | #endif // HIGHWAY_HWY_TARGETS_H_ |