/src/libvpx/vpx_ports/x86.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_PORTS_X86_H_ |
12 | | #define VPX_VPX_PORTS_X86_H_ |
13 | | #include <stdlib.h> |
14 | | |
15 | | #if defined(_MSC_VER) |
16 | | #include <intrin.h> /* For __cpuidex, __rdtsc */ |
17 | | #endif |
18 | | |
19 | | #include "vpx_config.h" |
20 | | #include "vpx/vpx_integer.h" |
21 | | |
22 | | #ifdef __cplusplus |
23 | | extern "C" { |
24 | | #endif |
25 | | |
/* Identifiers for x86 CPU vendors.
 * VPX_CPU_UNKNOWN (-1) indicates the vendor could not be determined. */
typedef enum {
  VPX_CPU_UNKNOWN = -1,
  VPX_CPU_AMD,
  VPX_CPU_AMD_OLD,
  VPX_CPU_CENTAUR,
  VPX_CPU_CYRIX,
  VPX_CPU_INTEL,
  VPX_CPU_NEXGEN,
  VPX_CPU_NSC,
  VPX_CPU_RISE,
  VPX_CPU_SIS,
  VPX_CPU_TRANSMETA,
  VPX_CPU_TRANSMETA_OLD,
  VPX_CPU_UMC,
  VPX_CPU_VIA,

  VPX_CPU_LAST /* Count of valid vendor ids; not itself a vendor. */
} vpx_cpu_t;
44 | | |
/* cpuid(func, func2, ax, bx, cx, dx): execute the CPUID instruction with
 * eax = func (leaf) and ecx = func2 (sub-leaf), storing the resulting
 * eax/ebx/ecx/edx into the four output lvalues.  One variant per supported
 * toolchain/arch combination. */
#if defined(__GNUC__) || defined(__ANDROID__)
#if VPX_ARCH_X86_64
#define cpuid(func, func2, ax, bx, cx, dx) \
  __asm__ __volatile__("cpuid \n\t" \
                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
                       : "a"(func), "c"(func2))
#else
/* 32-bit GCC/Clang: ebx may be reserved (e.g. as the PIC register), so the
 * cpuid result is parked in edi and the original ebx is restored. */
#define cpuid(func, func2, ax, bx, cx, dx) \
  __asm__ __volatile__( \
      "mov %%ebx, %%edi \n\t" \
      "cpuid \n\t" \
      "xchg %%edi, %%ebx \n\t" \
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
      : "a"(func), "c"(func2))
#endif
#elif defined(__SUNPRO_C) || \
    defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
#if VPX_ARCH_X86_64
/* Sun Studio 64-bit: preserve rbx across cpuid via rsi. */
#define cpuid(func, func2, ax, bx, cx, dx) \
  asm volatile( \
      "xchg %rsi, %rbx \n\t" \
      "cpuid \n\t" \
      "movl %ebx, %edi \n\t" \
      "xchg %rsi, %rbx \n\t" \
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
      : "a"(func), "c"(func2))
#else
/* Sun Studio 32-bit: preserve ebx with push/pop. */
#define cpuid(func, func2, ax, bx, cx, dx) \
  asm volatile( \
      "pushl %ebx \n\t" \
      "cpuid \n\t" \
      "movl %ebx, %edi \n\t" \
      "popl %ebx \n\t" \
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
      : "a"(func), "c"(func2))
#endif
#else /* end __SUNPRO__ */
#if VPX_ARCH_X86_64
#if defined(_MSC_VER) && _MSC_VER > 1500
/* MSVC newer than VS2008: __cpuidex takes the sub-leaf explicitly. */
#define cpuid(func, func2, a, b, c, d) \
  do { \
    int regs[4]; \
    __cpuidex(regs, func, func2); \
    a = regs[0]; \
    b = regs[1]; \
    c = regs[2]; \
    d = regs[3]; \
  } while (0)
#else
/* NOTE(review): __cpuid takes no sub-leaf parameter, so func2 is ignored
 * here; sub-leaf-dependent queries rely on the intrinsic leaving ecx == 0.
 * Confirm against the toolchain's __cpuid documentation. */
#define cpuid(func, func2, a, b, c, d) \
  do { \
    int regs[4]; \
    __cpuid(regs, func); \
    a = regs[0]; \
    b = regs[1]; \
    c = regs[2]; \
    d = regs[3]; \
  } while (0)
#endif
#else
/* 32-bit MSVC inline assembly. */
#define cpuid(func, func2, a, b, c, d) \
  __asm mov eax, func __asm mov ecx, func2 __asm cpuid __asm mov a, \
      eax __asm mov b, ebx __asm mov c, ecx __asm mov d, edx
#endif
#endif /* end others */
110 | | |
// NaCl has no support for xgetbv or the raw opcode.
/* xgetbv(): read the 64-bit extended control register XCR0 (xgetbv with
 * ecx = 0).  Used below to check which SIMD register state the OS saves
 * and restores before enabling AVX/AVX-512 code paths. */
#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
static INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;  // XCR index 0 selects XCR0.
  uint32_t eax, edx;
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
                   : "=a"(eax), "=d"(edx)
                   : "c"(ecx));
  return ((uint64_t)edx << 32) | eax;
}
#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
#include <immintrin.h>
#define xgetbv() _xgetbv(0)
#elif defined(_MSC_VER) && defined(_M_IX86)
static INLINE uint64_t xgetbv(void) {
  uint32_t eax_, edx_;
  __asm {
    xor ecx, ecx  // ecx = 0
    // Use the raw opcode for xgetbv for compatibility with older toolchains.
    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
    mov eax_, eax
    mov edx_, edx
  }
  return ((uint64_t)edx_ << 32) | eax_;
}
#else
#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
#endif
141 | | |
#if defined(_MSC_VER) && _MSC_VER >= 1700
/* Pull in windows.h for WINAPI_FAMILY_PARTITION.  NOMINMAX keeps it from
 * defining min/max macros; WIN32_LEAN_AND_MEAN trims the include. */
#undef NOMINMAX
#define NOMINMAX
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
/* Windows Store/UWP apps cannot read environment variables, so the
 * getenv() lookups below always behave as "not set". */
#define getenv(x) NULL
#endif
#endif
153 | | |
/* Capability bits in the value returned by x86_simd_caps(). */
#define HAS_MMX 0x001
#define HAS_SSE 0x002
#define HAS_SSE2 0x004
#define HAS_SSE3 0x008
#define HAS_SSSE3 0x010
#define HAS_SSE4_1 0x020
#define HAS_AVX 0x040
#define HAS_AVX2 0x080
#define HAS_AVX512 0x100
#ifndef BIT
#define BIT(n) (1u << (n))
#endif

/* cpuid feature-flag bit positions tested via FEATURE_SET().  As used in
 * x86_simd_caps(): MMX/SSE/SSE2 are read from leaf-1 edx, SSE3..SSE4.1 and
 * AVX from leaf-1 ecx, AVX2/AVX-512 from leaf-7 ebx. */
#define MMX_BITS BIT(23)
#define SSE_BITS BIT(25)
#define SSE2_BITS BIT(26)
#define SSE3_BITS BIT(0)
#define SSSE3_BITS BIT(9)
#define SSE4_1_BITS BIT(19)
// Bits 27 (OSXSAVE) & 28 (256-bit AVX)
#define AVX_BITS (BIT(27) | BIT(28))
#define AVX2_BITS BIT(5)
// Bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & 30 (AVX-512BW)
// & 31 (AVX-512VL)
#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))

/* Nonzero iff every bit of feature##_BITS is set in reg. */
#define FEATURE_SET(reg, feature) \
  (((reg) & (feature##_BITS)) == (feature##_BITS))
182 | | |
183 | 6 | static INLINE int x86_simd_caps(void) { |
184 | 6 | unsigned int flags = 0; |
185 | 6 | unsigned int mask = ~0u; |
186 | 6 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; |
187 | 6 | char *env; |
188 | 6 | (void)reg_ebx; |
189 | | |
190 | | /* See if the CPU capabilities are being overridden by the environment */ |
191 | 6 | env = getenv("VPX_SIMD_CAPS"); |
192 | 6 | if (env && *env) return (int)strtol(env, NULL, 0); |
193 | | |
194 | 6 | env = getenv("VPX_SIMD_CAPS_MASK"); |
195 | 6 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); |
196 | | |
197 | | /* Ensure that the CPUID instruction supports extended features */ |
198 | 6 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); |
199 | 6 | if (max_cpuid_val < 1) return 0; |
200 | | |
201 | | /* Get the standard feature flags */ |
202 | 6 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
203 | | |
204 | 6 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; |
205 | 6 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; |
206 | 6 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; |
207 | 6 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; |
208 | 6 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; |
209 | 6 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; |
210 | | |
211 | 6 | if (FEATURE_SET(reg_ecx, AVX)) { |
212 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. |
213 | 6 | if ((xgetbv() & 0x6) == 0x6) { |
214 | 6 | flags |= HAS_AVX; |
215 | 6 | if (max_cpuid_val >= 7) { |
216 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ |
217 | 6 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
218 | 6 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; |
219 | 6 | if (FEATURE_SET(reg_ebx, AVX512)) { |
220 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. |
221 | 0 | if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; |
222 | 0 | } |
223 | 6 | } |
224 | 6 | } |
225 | 6 | } |
226 | 6 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. |
227 | 6 | return flags & mask; |
228 | 6 | } vpx_scale_rtcd.c:x86_simd_caps Line | Count | Source | 183 | 2 | static INLINE int x86_simd_caps(void) { | 184 | 2 | unsigned int flags = 0; | 185 | 2 | unsigned int mask = ~0u; | 186 | 2 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 187 | 2 | char *env; | 188 | 2 | (void)reg_ebx; | 189 | | | 190 | | /* See if the CPU capabilities are being overridden by the environment */ | 191 | 2 | env = getenv("VPX_SIMD_CAPS"); | 192 | 2 | if (env && *env) return (int)strtol(env, NULL, 0); | 193 | | | 194 | 2 | env = getenv("VPX_SIMD_CAPS_MASK"); | 195 | 2 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 196 | | | 197 | | /* Ensure that the CPUID instruction supports extended features */ | 198 | 2 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 199 | 2 | if (max_cpuid_val < 1) return 0; | 200 | | | 201 | | /* Get the standard feature flags */ | 202 | 2 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 203 | | | 204 | 2 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 205 | 2 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 206 | 2 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 207 | 2 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 208 | 2 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 209 | 2 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 210 | | | 211 | 2 | if (FEATURE_SET(reg_ecx, AVX)) { | 212 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 213 | 2 | if ((xgetbv() & 0x6) == 0x6) { | 214 | 2 | flags |= HAS_AVX; | 215 | 2 | if (max_cpuid_val >= 7) { | 216 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 217 | 2 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 218 | 2 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 219 | 2 | if (FEATURE_SET(reg_ebx, AVX512)) { | 220 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. 
| 221 | 0 | if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; | 222 | 0 | } | 223 | 2 | } | 224 | 2 | } | 225 | 2 | } | 226 | 2 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 227 | 2 | return flags & mask; | 228 | 2 | } |
vpx_dsp_rtcd.c:x86_simd_caps Line | Count | Source | 183 | 2 | static INLINE int x86_simd_caps(void) { | 184 | 2 | unsigned int flags = 0; | 185 | 2 | unsigned int mask = ~0u; | 186 | 2 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 187 | 2 | char *env; | 188 | 2 | (void)reg_ebx; | 189 | | | 190 | | /* See if the CPU capabilities are being overridden by the environment */ | 191 | 2 | env = getenv("VPX_SIMD_CAPS"); | 192 | 2 | if (env && *env) return (int)strtol(env, NULL, 0); | 193 | | | 194 | 2 | env = getenv("VPX_SIMD_CAPS_MASK"); | 195 | 2 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 196 | | | 197 | | /* Ensure that the CPUID instruction supports extended features */ | 198 | 2 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 199 | 2 | if (max_cpuid_val < 1) return 0; | 200 | | | 201 | | /* Get the standard feature flags */ | 202 | 2 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 203 | | | 204 | 2 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 205 | 2 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 206 | 2 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 207 | 2 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 208 | 2 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 209 | 2 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 210 | | | 211 | 2 | if (FEATURE_SET(reg_ecx, AVX)) { | 212 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 213 | 2 | if ((xgetbv() & 0x6) == 0x6) { | 214 | 2 | flags |= HAS_AVX; | 215 | 2 | if (max_cpuid_val >= 7) { | 216 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 217 | 2 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 218 | 2 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 219 | 2 | if (FEATURE_SET(reg_ebx, AVX512)) { | 220 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. 
| 221 | 0 | if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; | 222 | 0 | } | 223 | 2 | } | 224 | 2 | } | 225 | 2 | } | 226 | 2 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 227 | 2 | return flags & mask; | 228 | 2 | } |
Line | Count | Source | 183 | 1 | static INLINE int x86_simd_caps(void) { | 184 | 1 | unsigned int flags = 0; | 185 | 1 | unsigned int mask = ~0u; | 186 | 1 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 187 | 1 | char *env; | 188 | 1 | (void)reg_ebx; | 189 | | | 190 | | /* See if the CPU capabilities are being overridden by the environment */ | 191 | 1 | env = getenv("VPX_SIMD_CAPS"); | 192 | 1 | if (env && *env) return (int)strtol(env, NULL, 0); | 193 | | | 194 | 1 | env = getenv("VPX_SIMD_CAPS_MASK"); | 195 | 1 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 196 | | | 197 | | /* Ensure that the CPUID instruction supports extended features */ | 198 | 1 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 199 | 1 | if (max_cpuid_val < 1) return 0; | 200 | | | 201 | | /* Get the standard feature flags */ | 202 | 1 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 203 | | | 204 | 1 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 205 | 1 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 206 | 1 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 207 | 1 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 208 | 1 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 209 | 1 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 210 | | | 211 | 1 | if (FEATURE_SET(reg_ecx, AVX)) { | 212 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 213 | 1 | if ((xgetbv() & 0x6) == 0x6) { | 214 | 1 | flags |= HAS_AVX; | 215 | 1 | if (max_cpuid_val >= 7) { | 216 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 217 | 1 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 218 | 1 | flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; | 219 | 1 | if (FEATURE_SET(reg_ebx, AVX512)) { | 220 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. 
| 221 | 0 | if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; | 222 | 0 | } | 223 | 1 | } | 224 | 1 | } | 225 | 1 | } | 226 | 1 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 227 | 1 | return flags & mask; | 228 | 1 | } |
Unexecuted instantiation: vp8_dx_iface.c:x86_simd_caps Unexecuted instantiation: onyxd_if.c:x86_simd_caps Unexecuted instantiation: threading.c:x86_simd_caps Unexecuted instantiation: systemdependent.c:x86_simd_caps Line | Count | Source | 183 | 1 | static INLINE int x86_simd_caps(void) { | 184 | 1 | unsigned int flags = 0; | 185 | 1 | unsigned int mask = ~0u; | 186 | 1 | unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; | 187 | 1 | char *env; | 188 | 1 | (void)reg_ebx; | 189 | | | 190 | | /* See if the CPU capabilities are being overridden by the environment */ | 191 | 1 | env = getenv("VPX_SIMD_CAPS"); | 192 | 1 | if (env && *env) return (int)strtol(env, NULL, 0); | 193 | | | 194 | 1 | env = getenv("VPX_SIMD_CAPS_MASK"); | 195 | 1 | if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); | 196 | | | 197 | | /* Ensure that the CPUID instruction supports extended features */ | 198 | 1 | cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); | 199 | 1 | if (max_cpuid_val < 1) return 0; | 200 | | | 201 | | /* Get the standard feature flags */ | 202 | 1 | cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 203 | | | 204 | 1 | flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; | 205 | 1 | flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; | 206 | 1 | flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; | 207 | 1 | flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; | 208 | 1 | flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; | 209 | 1 | flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; | 210 | | | 211 | 1 | if (FEATURE_SET(reg_ecx, AVX)) { | 212 | | // Check for OS-support of YMM state. Necessary for AVX and AVX2. | 213 | 1 | if ((xgetbv() & 0x6) == 0x6) { | 214 | 1 | flags |= HAS_AVX; | 215 | 1 | if (max_cpuid_val >= 7) { | 216 | | /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ | 217 | 1 | cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); | 218 | 1 | flags |= FEATURE_SET(reg_ebx, AVX2) ? 
HAS_AVX2 : 0; | 219 | 1 | if (FEATURE_SET(reg_ebx, AVX512)) { | 220 | | // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. | 221 | 0 | if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; | 222 | 0 | } | 223 | 1 | } | 224 | 1 | } | 225 | 1 | } | 226 | 1 | (void)reg_eax; // Avoid compiler warning on unused-but-set variable. | 227 | 1 | return flags & mask; | 228 | 1 | } |
Unexecuted instantiation: decodeframe.c:x86_simd_caps Unexecuted instantiation: detokenize.c:x86_simd_caps Unexecuted instantiation: decodemv.c:x86_simd_caps |
229 | | |
// Fine-Grain Measurement Functions
//
// If you are timing a small region of code, access the timestamp counter
// (TSC) via:
//
// unsigned int start = x86_tsc_start();
// ...
// unsigned int end = x86_tsc_end();
// unsigned int diff = end - start;
//
// The start/end functions introduce a few more instructions than using
// x86_readtsc directly, but prevent the CPU's out-of-order execution from
// affecting the measurement (by having earlier/later instructions be evaluated
// in the time interval). See the white paper, "How to Benchmark Code
// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
// Gabriele Paoloni for more information.
//
// If you are timing a large function (CPU time > a couple of seconds), use
// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
// out-of-order leakage that can occur is minimal compared to total runtime.
//
// Returns the low 32 bits of the timestamp counter (rdtsc, eax).
static INLINE unsigned int x86_readtsc(void) {
#if defined(__GNUC__)
  unsigned int tsc;
  __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :);
  return tsc;
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  unsigned int tsc;
  asm volatile("rdtsc\n\t" : "=a"(tsc) :);
  return tsc;
#else
#if VPX_ARCH_X86_64
  return (unsigned int)__rdtsc();
#else
  // 32-bit MSVC inline asm: the result is left in eax, the return register.
  __asm rdtsc;
#endif
#endif
}
// 64-bit CPU cycle counter
// Returns the full 64-bit timestamp counter (rdtsc, edx:eax).
static INLINE uint64_t x86_readtsc64(void) {
#if defined(__GNUC__)
  uint32_t hi, lo;
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
  return ((uint64_t)hi << 32) | lo;
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  uint_t hi, lo;
  asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
  return ((uint64_t)hi << 32) | lo;
#else
#if VPX_ARCH_X86_64
  return (uint64_t)__rdtsc();
#else
  // 32-bit MSVC inline asm: rdtsc leaves the value in edx:eax, the 64-bit
  // return registers.
  __asm rdtsc;
#endif
#endif
}
285 | | |
// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
// rdtscp waits for all prior instructions to retire before reading the TSC.
static INLINE unsigned int x86_readtscp(void) {
#if defined(__GNUC__)
  unsigned int tscp;
  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
  return tscp;
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  unsigned int tscp;
  asm volatile("rdtscp\n\t" : "=a"(tscp) :);
  return tscp;
#elif defined(_MSC_VER)
  unsigned int ui;  // Receives the processor-signature aux word; unused.
  return (unsigned int)__rdtscp(&ui);
#else
#if VPX_ARCH_X86_64
  // NOTE(review): __rdtscp normally takes an out-parameter for the aux
  // value; confirm this zero-argument form against the target toolchain.
  return (unsigned int)__rdtscp();
#else
  __asm rdtscp;
#endif
#endif
}
307 | | |
308 | 0 | static INLINE unsigned int x86_tsc_start(void) { |
309 | 0 | unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; |
310 | 0 | // This call should not be removed. See function notes above. |
311 | 0 | cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
312 | 0 | // Avoid compiler warnings on unused-but-set variables. |
313 | 0 | (void)reg_eax; |
314 | 0 | (void)reg_ebx; |
315 | 0 | (void)reg_ecx; |
316 | 0 | (void)reg_edx; |
317 | 0 | return x86_readtsc(); |
318 | 0 | } Unexecuted instantiation: vpx_scale_rtcd.c:x86_tsc_start Unexecuted instantiation: vpx_dsp_rtcd.c:x86_tsc_start Unexecuted instantiation: vp9_rtcd.c:x86_tsc_start Unexecuted instantiation: vp8_dx_iface.c:x86_tsc_start Unexecuted instantiation: onyxd_if.c:x86_tsc_start Unexecuted instantiation: threading.c:x86_tsc_start Unexecuted instantiation: systemdependent.c:x86_tsc_start Unexecuted instantiation: rtcd.c:x86_tsc_start Unexecuted instantiation: decodeframe.c:x86_tsc_start Unexecuted instantiation: detokenize.c:x86_tsc_start Unexecuted instantiation: decodemv.c:x86_tsc_start |
319 | | |
320 | 0 | static INLINE unsigned int x86_tsc_end(void) { |
321 | 0 | uint32_t v = x86_readtscp(); |
322 | 0 | unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; |
323 | 0 | // This call should not be removed. See function notes above. |
324 | 0 | cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); |
325 | 0 | // Avoid compiler warnings on unused-but-set variables. |
326 | 0 | (void)reg_eax; |
327 | 0 | (void)reg_ebx; |
328 | 0 | (void)reg_ecx; |
329 | 0 | (void)reg_edx; |
330 | 0 | return v; |
331 | 0 | } Unexecuted instantiation: vpx_scale_rtcd.c:x86_tsc_end Unexecuted instantiation: vpx_dsp_rtcd.c:x86_tsc_end Unexecuted instantiation: vp9_rtcd.c:x86_tsc_end Unexecuted instantiation: vp8_dx_iface.c:x86_tsc_end Unexecuted instantiation: onyxd_if.c:x86_tsc_end Unexecuted instantiation: threading.c:x86_tsc_end Unexecuted instantiation: systemdependent.c:x86_tsc_end Unexecuted instantiation: rtcd.c:x86_tsc_end Unexecuted instantiation: decodeframe.c:x86_tsc_end Unexecuted instantiation: detokenize.c:x86_tsc_end Unexecuted instantiation: decodemv.c:x86_tsc_end |
332 | | |
/* x86_pause_hint(): emit the "pause" instruction, a spin-wait loop hint. */
#if defined(__GNUC__)
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#define x86_pause_hint() asm volatile("pause \n\t")
#else
#if VPX_ARCH_X86_64
#define x86_pause_hint() _mm_pause();
#else
#define x86_pause_hint() __asm pause
#endif
#endif
344 | | |
/* x87_set_control_word()/x87_get_control_word(): load (fldcw) and store
 * (fstcw) the x87 FPU control word.  Used by x87_set_double_precision()
 * below to adjust the precision-control field. */
#if defined(__GNUC__)
static void x87_set_control_word(unsigned short mode) {
  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
}
static unsigned short x87_get_control_word(void) {
  unsigned short mode;
  __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
  return mode;
}
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
static void x87_set_control_word(unsigned short mode) {
  asm volatile("fldcw %0" : : "m"(*&mode));
}
static unsigned short x87_get_control_word(void) {
  unsigned short mode;
  asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
  return mode;
}
#elif VPX_ARCH_X86_64
/* No fldcw intrinsics on Windows x64, punt to external asm */
extern void vpx_winx64_fldcw(unsigned short mode);
extern unsigned short vpx_winx64_fstcw(void);
#define x87_set_control_word vpx_winx64_fldcw
#define x87_get_control_word vpx_winx64_fstcw
#else
/* 32-bit MSVC inline assembly. */
static void x87_set_control_word(unsigned short mode) {
  __asm { fldcw mode }
}
static unsigned short x87_get_control_word(void) {
  unsigned short mode;
  __asm { fstcw mode }
  return mode;
}
#endif
379 | | |
/* Restrict x87 operations to double (53-bit) precision and return the
 * previous control word so the caller can restore it later via
 * x87_set_control_word(). */
static INLINE unsigned int x87_set_double_precision(void) {
  unsigned int mode = x87_get_control_word();
  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
  // 8.1.5.2 Precision Control Field
  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
  // determine the number of bits used in floating point calculations. To match
  // later SSE instructions restrict x87 operations to Double Precision (0x200).
  // Precision PC Field
  // Single Precision (24-Bits) 00B
  // Reserved 01B
  // Double Precision (53-Bits) 10B
  // Extended Precision (64-Bits) 11B
  x87_set_control_word((mode & ~0x300u) | 0x200u);
  return mode;
}
396 | | |
397 | | #ifdef __cplusplus |
398 | | } // extern "C" |
399 | | #endif |
400 | | |
401 | | #endif // VPX_VPX_PORTS_X86_H_ |