Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_PORTS_X86_H_ |
13 | | #define AOM_AOM_PORTS_X86_H_ |
14 | | #include <stdlib.h> |
15 | | |
16 | | #if defined(_MSC_VER) |
17 | | #include <intrin.h> /* For __cpuidex, __rdtsc */ |
18 | | #endif |
19 | | |
20 | | #include "aom/aom_integer.h" |
21 | | #include "config/aom_config.h" |
22 | | |
23 | | #ifdef __cplusplus |
24 | | extern "C" { |
25 | | #endif |
26 | | |
// CPU identifiers, apparently keyed by manufacturer (judging from the
// enumerator names). AOM_CPU_UNKNOWN is -1, the named entries count up from 0,
// and AOM_CPU_LAST is a trailing sentinel equal to the number of named entries.
27 | | typedef enum { |
28 | | AOM_CPU_UNKNOWN = -1, |
29 | | AOM_CPU_AMD, |
30 | | AOM_CPU_AMD_OLD, |
31 | | AOM_CPU_CENTAUR, |
32 | | AOM_CPU_CYRIX, |
33 | | AOM_CPU_INTEL, |
34 | | AOM_CPU_NEXGEN, |
35 | | AOM_CPU_NSC, |
36 | | AOM_CPU_RISE, |
37 | | AOM_CPU_SIS, |
38 | | AOM_CPU_TRANSMETA, |
39 | | AOM_CPU_TRANSMETA_OLD, |
40 | | AOM_CPU_UMC, |
41 | | AOM_CPU_VIA, |
42 | | |
43 | | AOM_CPU_LAST |
44 | | } aom_cpu_t; |
45 | | |
// cpuid(func, func2, a, b, c, d)
//
// Executes the CPUID instruction with EAX = func and ECX = func2 and stores
// the resulting EAX, EBX, ECX and EDX into the four lvalues a, b, c and d.
// Exactly one definition is selected, depending on compiler and target
// architecture.
46 | | #if defined(__GNUC__) || defined(__ANDROID__) |
47 | | #if AOM_ARCH_X86_64 |
// GNU-style asm, 64-bit: all four registers are bound directly as outputs.
48 | | #define cpuid(func, func2, ax, bx, cx, dx) \ |
49 | 9 | __asm__ __volatile__("cpuid \n\t" \ |
50 | 9 | : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ |
51 | 9 | : "a"(func), "c"(func2)) |
52 | | #else |
// GNU-style asm, 32-bit: EBX is parked in EDI across CPUID and swapped back
// afterwards, so the caller's EBX is preserved and CPUID's EBX result is
// returned through EDI ("=D"). NOTE(review): presumably because EBX can be
// reserved (e.g. as the PIC register) on 32-bit targets -- confirm.
53 | | #define cpuid(func, func2, ax, bx, cx, dx) \ |
54 | | __asm__ __volatile__( \ |
55 | | "mov %%ebx, %%edi \n\t" \ |
56 | | "cpuid \n\t" \ |
57 | | "xchg %%edi, %%ebx \n\t" \ |
58 | | : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ |
59 | | : "a"(func), "c"(func2)) |
60 | | #endif |
61 | | #elif defined(__SUNPRO_C) || \ |
62 | | defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ |
63 | | #if AOM_ARCH_X86_64 |
// SunPro, 64-bit: RBX is swapped with RSI around CPUID and CPUID's EBX result
// is copied to EDI, which is the register bound to the b output.
64 | | #define cpuid(func, func2, ax, bx, cx, dx) \ |
65 | | asm volatile( \ |
66 | | "xchg %rsi, %rbx \n\t" \ |
67 | | "cpuid \n\t" \ |
68 | | "movl %ebx, %edi \n\t" \ |
69 | | "xchg %rsi, %rbx \n\t" \ |
70 | | : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ |
71 | | : "a"(func), "c"(func2)) |
72 | | #else |
// SunPro, 32-bit: EBX is saved on the stack around CPUID and CPUID's EBX
// result is copied to EDI, which is the register bound to the b output.
73 | | #define cpuid(func, func2, ax, bx, cx, dx) \ |
74 | | asm volatile( \ |
75 | | "pushl %ebx \n\t" \ |
76 | | "cpuid \n\t" \ |
77 | | "movl %ebx, %edi \n\t" \ |
78 | | "popl %ebx \n\t" \ |
79 | | : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ |
80 | | : "a"(func), "c"(func2)) |
81 | | #endif |
82 | | #else /* end __SUNPRO__ */ |
83 | | #if AOM_ARCH_X86_64 |
84 | | #if defined(_MSC_VER) && _MSC_VER > 1500 |
// Other compilers, 64-bit, _MSC_VER > 1500: uses the __cpuidex intrinsic,
// which takes both func and func2. The macro declares a local named regs, so
// the output arguments must not themselves be named regs.
85 | | #define cpuid(func, func2, a, b, c, d) \ |
86 | | do { \ |
87 | | int regs[4]; \ |
88 | | __cpuidex(regs, func, func2); \ |
89 | | a = regs[0]; \ |
90 | | b = regs[1]; \ |
91 | | c = regs[2]; \ |
92 | | d = regs[3]; \ |
93 | | } while (0) |
94 | | #else |
// Other compilers, 64-bit, fallback: __cpuid is called with func only, so
// func2 is ignored by this variant.
95 | | #define cpuid(func, func2, a, b, c, d) \ |
96 | | do { \ |
97 | | int regs[4]; \ |
98 | | __cpuid(regs, func); \ |
99 | | a = regs[0]; \ |
100 | | b = regs[1]; \ |
101 | | c = regs[2]; \ |
102 | | d = regs[3]; \ |
103 | | } while (0) |
104 | | #endif |
105 | | #else |
// Other compilers, 32-bit: a sequence of __asm statements. This expansion is
// not wrapped in do { } while (0), so it is several statements, not one.
106 | | /* clang-format off */ |
107 | | #define cpuid(func, func2, a, b, c, d) \ |
108 | | __asm mov eax, func \ |
109 | | __asm mov ecx, func2 \ |
110 | | __asm cpuid \ |
111 | | __asm mov a, eax \ |
112 | | __asm mov b, ebx \ |
113 | | __asm mov c, ecx \ |
114 | | __asm mov d, edx |
115 | | #endif |
116 | | /* clang-format on */ |
117 | | #endif /* end others */ |
118 | | |
119 | | // NaCl has no support for xgetbv or the raw opcode. |
120 | | #if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__)) |
/* Executes XGETBV with ECX = 0 and returns the EDX:EAX result as one 64-bit
 * value (EDX in the upper half, EAX in the lower half). */
static inline uint64_t xgetbv(void) {
  uint32_t lo, hi;
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n" : "=a"(lo), "=d"(hi) : "c"(0u));
  return (uint64_t)lo | ((uint64_t)hi << 32);
}
|
130 | | #elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \ |
131 | | _MSC_FULL_VER >= 160040219 // >= VS2010 SP1 |
132 | | #include <immintrin.h> |
133 | | #define xgetbv() _xgetbv(0) |
134 | | #elif defined(_MSC_VER) && defined(_M_IX86) |
// MSVC 32-bit inline-assembly variant of xgetbv(): clears ECX, emits the three
// opcode bytes 0x0f 0x01 0xd0, then returns EDX:EAX combined into a 64-bit
// value (EDX in the upper half, EAX in the lower half).
135 | | static inline uint64_t xgetbv(void) { |
136 | | uint32_t eax_, edx_; |
137 | | __asm { |
138 | | xor ecx, ecx // ecx = 0 |
139 | | // Use the raw opcode for xgetbv for compatibility with older toolchains. |
140 | | __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 |
141 | | mov eax_, eax |
142 | | mov edx_, edx |
143 | | } |
144 | | return ((uint64_t)edx_ << 32) | eax_; |
145 | | } |
146 | | #else |
147 | | #define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. |
148 | | #endif |
149 | | |
150 | | #if defined(_MSC_VER) && _MSC_VER >= 1700 |
151 | | #undef NOMINMAX |
152 | | #define NOMINMAX |
153 | | #undef WIN32_LEAN_AND_MEAN |
154 | | #define WIN32_LEAN_AND_MEAN |
155 | | #include <windows.h> |
156 | | #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) |
157 | | #define getenv(x) NULL |
158 | | #endif |
159 | | #endif |
160 | | |
161 | 3 | #define HAS_MMX 0x01 |
162 | 3 | #define HAS_SSE 0x02 |
163 | 3 | #define HAS_SSE2 0x04 |
164 | 3 | #define HAS_SSE3 0x08 |
165 | 98 | #define HAS_SSSE3 0x10 |
166 | 50 | #define HAS_SSE4_1 0x20 |
167 | 3 | #define HAS_AVX 0x40 |
168 | 117 | #define HAS_AVX2 0x80 |
169 | | #define HAS_SSE4_2 0x100 |
170 | 0 | #define HAS_AVX512 0x200 |
171 | | |
172 | | #ifndef BIT |
173 | 54 | #define BIT(n) (1u << (n)) |
174 | | #endif |
175 | | |
176 | 6 | #define MMX_BITS BIT(23) |
177 | 6 | #define SSE_BITS BIT(25) |
178 | 6 | #define SSE2_BITS BIT(26) |
179 | 6 | #define SSE3_BITS BIT(0) |
180 | 6 | #define SSSE3_BITS BIT(9) |
181 | 6 | #define SSE4_1_BITS BIT(19) |
182 | | // Bits 27 (OSXSAVE) & 28 (256-bit AVX) |
183 | 6 | #define AVX_BITS (BIT(27) | BIT(28)) |
184 | 6 | #define AVX2_BITS BIT(5) |
185 | | // Bits 16 (AVX512-F) & 17 (AVX512-DQ) & 28 (AVX512-CD) & 30 (AVX512-BW) |
186 | | // & 31 (AVX512-VL) |
187 | 0 | #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31)) |
188 | | // Bits 1 (AVX512-VBMI) & 6 (AVX512-VBMI2) & 8 (AVX512-GFNI) & 9 (AVX512-VAES) & |
189 | | // 10 (AVX512-VPCLMULQDQ) & 11 (AVX512-VNNI) & 12 (AVX512-BITALG) & |
190 | | // 14 (AVX512-POPCNTDQ) |
191 | | #define AVX512_DL_BITS \ |
192 | 0 | (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14)) |
193 | | |
// FEATURE_SET(reg, feature) is non-zero only when every bit of <feature>_BITS
// is set in reg. `feature` is token-pasted with _BITS, so pass the bare name
// (e.g. FEATURE_SET(reg_ecx, AVX) tests against AVX_BITS).
194 | | #define FEATURE_SET(reg, feature) \ |
195 | 24 | (((reg) & (feature##_BITS)) == (feature##_BITS)) |
196 | | |
197 | 3 | static inline int x86_simd_caps(void) { |
198 | 3 | unsigned int flags = 0; |
199 | 3 | unsigned int mask = ~0u; |
200 | 3 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; |
201 | 3 | char *env; |
202 | | |
203 | | /* See if the CPU capabilities are being overridden by the environment */ |
204 | 3 | env = getenv("AOM_SIMD_CAPS"); |
205 | 3 | if (env && *env) return (int)strtol(env, NULL, 0); |
206 | | |
207 | 3 | env = getenv("AOM_SIMD_CAPS_MASK"); |
208 | 3 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); |
209 | | |
210 | | /* Ensure that the CPUID instruction supports extended features */ |
211 | 3 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); |
212 | | |
213 | 3 | if (max_cpuid_val < 1) return 0; |
214 | | |
215 | | /* Get the standard feature flags */ |
216 | 3 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
217 | | |
218 | 3 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; |
219 | 3 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; |
220 | 3 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; |
221 | 3 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; |
222 | 3 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; |
223 | 3 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; |
224 | | |
225 | | // bits 27 (OSXSAVE) & 28 (256-bit AVX) |
226 | 3 | if (FEATURE_SET(reg_ecx, AVX)) { |
227 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. |
228 | 3 | if ((xgetbv() & 0x6) == 0x6) { |
229 | 3 | flags |= HAS_AVX; |
230 | 3 | if (max_cpuid_val >= 7) { |
231 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ |
232 | 3 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
233 | 3 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; |
234 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX512. |
235 | | // Only set HAS_AVX512 flag if AVX512_DL feature are supported. |
236 | | // Older AVX512 implementations (such as Skylake) have turbo curves that |
237 | | // are currently problematic for mixed AVX512/AVX2 code |
238 | 3 | if ((xgetbv() & 0xe6) == 0xe6) { |
239 | 0 | flags |= |
240 | 0 | FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL) |
241 | 0 | ? HAS_AVX512 |
242 | 0 | : 0; |
243 | 0 | } |
244 | 3 | } |
245 | 3 | } |
246 | 3 | } |
247 | 3 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. |
248 | 3 | return flags & mask; |
249 | 3 | } aom_dsp_rtcd.c:x86_simd_caps Line | Count | Source | 197 | 1 | static inline int x86_simd_caps(void) { | 198 | 1 | unsigned int flags = 0; | 199 | 1 | unsigned int mask = ~0u; | 200 | 1 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 201 | 1 | char *env; | 202 | | | 203 | | /* See if the CPU capabilities are being overridden by the environment */ | 204 | 1 | env = getenv("AOM_SIMD_CAPS"); | 205 | 1 | if (env && *env) return (int)strtol(env, NULL, 0); | 206 | | | 207 | 1 | env = getenv("AOM_SIMD_CAPS_MASK"); | 208 | 1 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 209 | | | 210 | | /* Ensure that the CPUID instruction supports extended features */ | 211 | 1 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 212 | | | 213 | 1 | if (max_cpuid_val < 1) return 0; | 214 | | | 215 | | /* Get the standard feature flags */ | 216 | 1 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 217 | | | 218 | 1 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 219 | 1 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 220 | 1 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 221 | 1 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 222 | 1 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 223 | 1 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 224 | | | 225 | | // bits 27 (OSXSAVE) & 28 (256-bit AVX) | 226 | 1 | if (FEATURE_SET(reg_ecx, AVX)) { | 227 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 228 | 1 | if ((xgetbv() & 0x6) == 0x6) { | 229 | 1 | flags |= HAS_AVX; | 230 | 1 | if (max_cpuid_val >= 7) { | 231 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 232 | 1 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 233 | 1 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 234 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX512. | 235 | | // Only set HAS_AVX512 flag if AVX512_DL feature are supported. 
| 236 | | // Older AVX512 implementations (such as Skylake) have turbo curves that | 237 | | // are currently problematic for mixed AVX512/AVX2 code | 238 | 1 | if ((xgetbv() & 0xe6) == 0xe6) { | 239 | 0 | flags |= | 240 | 0 | FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL) | 241 | 0 | ? HAS_AVX512 | 242 | 0 | : 0; | 243 | 0 | } | 244 | 1 | } | 245 | 1 | } | 246 | 1 | } | 247 | 1 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 248 | 1 | return flags & mask; | 249 | 1 | } |
aom_scale_rtcd.c:x86_simd_caps Line | Count | Source | 197 | 1 | static inline int x86_simd_caps(void) { | 198 | 1 | unsigned int flags = 0; | 199 | 1 | unsigned int mask = ~0u; | 200 | 1 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 201 | 1 | char *env; | 202 | | | 203 | | /* See if the CPU capabilities are being overridden by the environment */ | 204 | 1 | env = getenv("AOM_SIMD_CAPS"); | 205 | 1 | if (env && *env) return (int)strtol(env, NULL, 0); | 206 | | | 207 | 1 | env = getenv("AOM_SIMD_CAPS_MASK"); | 208 | 1 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 209 | | | 210 | | /* Ensure that the CPUID instruction supports extended features */ | 211 | 1 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 212 | | | 213 | 1 | if (max_cpuid_val < 1) return 0; | 214 | | | 215 | | /* Get the standard feature flags */ | 216 | 1 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 217 | | | 218 | 1 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 219 | 1 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 220 | 1 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 221 | 1 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 222 | 1 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 223 | 1 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 224 | | | 225 | | // bits 27 (OSXSAVE) & 28 (256-bit AVX) | 226 | 1 | if (FEATURE_SET(reg_ecx, AVX)) { | 227 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 228 | 1 | if ((xgetbv() & 0x6) == 0x6) { | 229 | 1 | flags |= HAS_AVX; | 230 | 1 | if (max_cpuid_val >= 7) { | 231 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 232 | 1 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 233 | 1 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 234 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX512. | 235 | | // Only set HAS_AVX512 flag if AVX512_DL feature are supported. 
| 236 | | // Older AVX512 implementations (such as Skylake) have turbo curves that | 237 | | // are currently problematic for mixed AVX512/AVX2 code | 238 | 1 | if ((xgetbv() & 0xe6) == 0xe6) { | 239 | 0 | flags |= | 240 | 0 | FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL) | 241 | 0 | ? HAS_AVX512 | 242 | 0 | : 0; | 243 | 0 | } | 244 | 1 | } | 245 | 1 | } | 246 | 1 | } | 247 | 1 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 248 | 1 | return flags & mask; | 249 | 1 | } |
Line | Count | Source | 197 | 1 | static inline int x86_simd_caps(void) { | 198 | 1 | unsigned int flags = 0; | 199 | 1 | unsigned int mask = ~0u; | 200 | 1 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 201 | 1 | char *env; | 202 | | | 203 | | /* See if the CPU capabilities are being overridden by the environment */ | 204 | 1 | env = getenv("AOM_SIMD_CAPS"); | 205 | 1 | if (env && *env) return (int)strtol(env, NULL, 0); | 206 | | | 207 | 1 | env = getenv("AOM_SIMD_CAPS_MASK"); | 208 | 1 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 209 | | | 210 | | /* Ensure that the CPUID instruction supports extended features */ | 211 | 1 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 212 | | | 213 | 1 | if (max_cpuid_val < 1) return 0; | 214 | | | 215 | | /* Get the standard feature flags */ | 216 | 1 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 217 | | | 218 | 1 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 219 | 1 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 220 | 1 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 221 | 1 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 222 | 1 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 223 | 1 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 224 | | | 225 | | // bits 27 (OSXSAVE) & 28 (256-bit AVX) | 226 | 1 | if (FEATURE_SET(reg_ecx, AVX)) { | 227 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 228 | 1 | if ((xgetbv() & 0x6) == 0x6) { | 229 | 1 | flags |= HAS_AVX; | 230 | 1 | if (max_cpuid_val >= 7) { | 231 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 232 | 1 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 233 | 1 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 234 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX512. | 235 | | // Only set HAS_AVX512 flag if AVX512_DL feature are supported. 
| 236 | | // Older AVX512 implementations (such as Skylake) have turbo curves that | 237 | | // are currently problematic for mixed AVX512/AVX2 code | 238 | 1 | if ((xgetbv() & 0xe6) == 0xe6) { | 239 | 0 | flags |= | 240 | 0 | FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL) | 241 | 0 | ? HAS_AVX512 | 242 | 0 | : 0; | 243 | 0 | } | 244 | 1 | } | 245 | 1 | } | 246 | 1 | } | 247 | 1 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 248 | 1 | return flags & mask; | 249 | 1 | } |
|
250 | | |
251 | | // Fine-Grain Measurement Functions |
252 | | // |
253 | | // If you are timing a small region of code, access the timestamp counter |
254 | | // (TSC) via: |
255 | | // |
256 | | // unsigned int start = x86_tsc_start(); |
257 | | // ... |
258 | | // unsigned int end = x86_tsc_end(); |
259 | | // unsigned int diff = end - start; |
260 | | // |
261 | | // The start/end functions introduce a few more instructions than using |
262 | | // x86_readtsc directly, but prevent the CPU's out-of-order execution from |
263 | | // affecting the measurement (by having earlier/later instructions be evaluated |
264 | | // in the time interval). See the white paper, "How to Benchmark Code |
265 | | // Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by |
266 | | // Gabriele Paoloni for more information. |
267 | | // |
268 | | // If you are timing a large function (CPU time > a couple of seconds), use |
269 | | // x86_readtsc64 to read the timestamp counter in a 64-bit integer. The |
270 | | // out-of-order leakage that can occur is minimal compared to total runtime. |
/* Returns the low 32 bits of the CPU timestamp counter.
 *
 * RDTSC writes the counter to EDX:EAX. Only EAX is returned, but EDX is bound
 * to a dummy output in the inline-asm variants so the compiler knows that the
 * instruction overwrites it.
 */
static inline unsigned int x86_readtsc(void) {
#if defined(__GNUC__)
  unsigned int tsc, tsc_hi;
  __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc), "=d"(tsc_hi));
  (void)tsc_hi;  // Upper half is intentionally discarded.
  return tsc;
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  unsigned int tsc, tsc_hi;
  asm volatile("rdtsc\n\t" : "=a"(tsc), "=d"(tsc_hi));
  (void)tsc_hi;  // Upper half is intentionally discarded.
  return tsc;
#else
#if AOM_ARCH_X86_64
  return (unsigned int)__rdtsc();
#else
  // No explicit return: the value left in EAX by rdtsc is the result.
  __asm rdtsc;
#endif
#endif
}
288 | | // 64-bit CPU cycle counter |
/* Returns the full 64-bit CPU timestamp counter (EDX:EAX from RDTSC, or the
 * __rdtsc() intrinsic where that is used). */
static inline uint64_t x86_readtsc64(void) {
#if defined(__GNUC__)
  uint32_t tsc_lo, tsc_hi;
  __asm__ __volatile__("rdtsc" : "=a"(tsc_lo), "=d"(tsc_hi));
  return (uint64_t)tsc_lo | ((uint64_t)tsc_hi << 32);
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  uint_t tsc_hi, tsc_lo;
  asm volatile("rdtsc\n\t" : "=a"(tsc_lo), "=d"(tsc_hi));
  return (uint64_t)tsc_lo | ((uint64_t)tsc_hi << 32);
#else
#if AOM_ARCH_X86_64
  return (uint64_t)__rdtsc();
#else
  // No explicit return: the value left in EDX:EAX by rdtsc is the result.
  __asm rdtsc;
#endif
#endif
}
306 | | |
307 | | // 32-bit CPU cycle counter with a partial fence against out-of-order execution. |
/* Returns the low 32 bits of the CPU timestamp counter, read with RDTSCP.
 *
 * RDTSCP writes the counter to EDX:EAX and an auxiliary value to ECX. Only
 * EAX is returned, but EDX and ECX are bound to dummy outputs in the
 * inline-asm variants so the compiler knows that the instruction overwrites
 * them.
 */
static inline unsigned int x86_readtscp(void) {
#if defined(__GNUC__)
  unsigned int tscp, tsc_hi, tsc_aux;
  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp), "=d"(tsc_hi), "=c"(tsc_aux));
  (void)tsc_hi;   // Upper half is intentionally discarded.
  (void)tsc_aux;  // Auxiliary value is intentionally discarded.
  return tscp;
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  unsigned int tscp, tsc_hi, tsc_aux;
  asm volatile("rdtscp\n\t" : "=a"(tscp), "=d"(tsc_hi), "=c"(tsc_aux));
  (void)tsc_hi;   // Upper half is intentionally discarded.
  (void)tsc_aux;  // Auxiliary value is intentionally discarded.
  return tscp;
#elif defined(_MSC_VER)
  unsigned int ui;
  return (unsigned int)__rdtscp(&ui);
#else
#if AOM_ARCH_X86_64
  return (unsigned int)__rdtscp();
#else
  __asm rdtscp;
#endif
#endif
}
328 | | |
/* Opens a timed region: issues CPUID, then returns the 32-bit timestamp from
 * x86_readtsc(). Pair with x86_tsc_end(). */
static inline unsigned int x86_tsc_start(void) {
  unsigned int out_a, out_b, out_c, out_d;

  // The CPUID call must stay: it keeps earlier instructions from being
  // evaluated inside the measured interval. Its results are not needed.
  cpuid(0, 0, out_a, out_b, out_c, out_d);

  // Reference each output once so that no unused-but-set warning is raised.
  (void)out_a;
  (void)out_b;
  (void)out_c;
  (void)out_d;

  return x86_readtsc();
}
340 | | |
/* Closes a timed region: takes the 32-bit timestamp from x86_readtscp() first,
 * then issues CPUID, and returns the timestamp. Pair with x86_tsc_start(). */
static inline unsigned int x86_tsc_end(void) {
  const uint32_t stamp = x86_readtscp();
  unsigned int out_a, out_b, out_c, out_d;

  // The CPUID call must stay: it keeps later instructions from being
  // evaluated inside the measured interval. Its results are not needed.
  cpuid(0, 0, out_a, out_b, out_c, out_d);

  // Reference each output once so that no unused-but-set warning is raised.
  (void)out_a;
  (void)out_b;
  (void)out_c;
  (void)out_d;

  return stamp;
}
353 | | |
// x86_pause_hint() issues the PAUSE instruction. Every variant expands to a
// single expression/statement WITHOUT a trailing semicolon, so that
// `if (cond) x86_pause_hint(); else ...` parses as intended on all toolchains.
#if defined(__GNUC__)
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#define x86_pause_hint() asm volatile("pause \n\t")
#else
#if AOM_ARCH_X86_64
#define x86_pause_hint() _mm_pause()
#else
#define x86_pause_hint() __asm pause
#endif
#endif
365 | | |
// x87 FPU control word accessors.
//   x87_set_control_word(mode)  loads `mode` into the control word (fldcw).
//   x87_get_control_word()      returns the current control word (fstcw).
// The function variants are `static inline`, like the other helpers in this
// header, so that including the header without calling them does not leave
// unused non-inline static functions in every translation unit.
#if defined(__GNUC__)
static inline void x87_set_control_word(unsigned short mode) {
  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
}
static inline unsigned short x87_get_control_word(void) {
  unsigned short mode;
  __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
  return mode;
}
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
static inline void x87_set_control_word(unsigned short mode) {
  asm volatile("fldcw %0" : : "m"(*&mode));
}
static inline unsigned short x87_get_control_word(void) {
  unsigned short mode;
  asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
  return mode;
}
#elif AOM_ARCH_X86_64
/* No fldcw intrinsics on Windows x64, punt to external asm */
extern void aom_winx64_fldcw(unsigned short mode);
extern unsigned short aom_winx64_fstcw(void);
#define x87_set_control_word aom_winx64_fldcw
#define x87_get_control_word aom_winx64_fstcw
#else
static inline void x87_set_control_word(unsigned short mode) {
  __asm { fldcw mode }
}
static inline unsigned short x87_get_control_word(void) {
  unsigned short mode;
  __asm { fstcw mode }
  return mode;
}
#endif
400 | | |
/* Switches the x87 FPU to double precision and returns the control word that
 * was in effect beforehand, so the caller can restore it later with
 * x87_set_control_word().
 *
 * Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
 * https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
 * 8.1.5.2 Precision Control Field
 * Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
 * determine the number of bits used in floating point calculations. To match
 * later SSE instructions restrict x87 operations to Double Precision (0x200).
 *   Precision                     PC Field
 *   Single Precision (24-Bits)    00B
 *   Reserved                      01B
 *   Double Precision (53-Bits)    10B
 *   Extended Precision (64-Bits)  11B
 */
static inline unsigned int x87_set_double_precision(void) {
  const unsigned int pc_field_mask = 0x300u;
  const unsigned int pc_double = 0x200u;
  const unsigned int previous = x87_get_control_word();

  // Clear both precision-control bits, then select double precision.
  x87_set_control_word((previous & ~pc_field_mask) | pc_double);
  return previous;
}
417 | | |
418 | | #ifdef __cplusplus |
419 | | } // extern "C" |
420 | | #endif |
421 | | |
422 | | #endif // AOM_AOM_PORTS_X86_H_ |