Coverage Report

Created: 2026-02-14 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_ports/x86.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_PORTS_X86_H_
13
#define AOM_AOM_PORTS_X86_H_
14
#include <stdlib.h>
15
16
#if defined(_MSC_VER)
17
#include <intrin.h> /* For __cpuidex, __rdtsc */
18
#endif
19
20
#include "aom/aom_integer.h"
21
#include "config/aom_config.h"
22
23
#ifdef __cplusplus
24
extern "C" {
25
#endif
26
27
typedef enum {
28
  AOM_CPU_UNKNOWN = -1,
29
  AOM_CPU_AMD,
30
  AOM_CPU_AMD_OLD,
31
  AOM_CPU_CENTAUR,
32
  AOM_CPU_CYRIX,
33
  AOM_CPU_INTEL,
34
  AOM_CPU_NEXGEN,
35
  AOM_CPU_NSC,
36
  AOM_CPU_RISE,
37
  AOM_CPU_SIS,
38
  AOM_CPU_TRANSMETA,
39
  AOM_CPU_TRANSMETA_OLD,
40
  AOM_CPU_UMC,
41
  AOM_CPU_VIA,
42
43
  AOM_CPU_LAST
44
} aom_cpu_t;
45
46
#if defined(__GNUC__) || defined(__ANDROID__)
47
#if AOM_ARCH_X86_64
48
#define cpuid(func, func2, ax, bx, cx, dx)                      \
49
9
  __asm__ __volatile__("cpuid           \n\t"                   \
50
9
                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
51
9
                       : "a"(func), "c"(func2))
52
#else
53
#define cpuid(func, func2, ax, bx, cx, dx)     \
54
  __asm__ __volatile__(                        \
55
      "mov %%ebx, %%edi   \n\t"                \
56
      "cpuid              \n\t"                \
57
      "xchg %%edi, %%ebx  \n\t"                \
58
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
59
      : "a"(func), "c"(func2))
60
#endif
61
#elif defined(__SUNPRO_C) || \
62
    defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
63
#if AOM_ARCH_X86_64
64
#define cpuid(func, func2, ax, bx, cx, dx)     \
65
  asm volatile(                                \
66
      "xchg %rsi, %rbx \n\t"                   \
67
      "cpuid           \n\t"                   \
68
      "movl %ebx, %edi \n\t"                   \
69
      "xchg %rsi, %rbx \n\t"                   \
70
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
71
      : "a"(func), "c"(func2))
72
#else
73
#define cpuid(func, func2, ax, bx, cx, dx)     \
74
  asm volatile(                                \
75
      "pushl %ebx       \n\t"                  \
76
      "cpuid            \n\t"                  \
77
      "movl %ebx, %edi  \n\t"                  \
78
      "popl %ebx        \n\t"                  \
79
      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
80
      : "a"(func), "c"(func2))
81
#endif
82
#else /* end __SUNPRO__ */
83
#if AOM_ARCH_X86_64
84
#if defined(_MSC_VER) && _MSC_VER > 1500
85
#define cpuid(func, func2, a, b, c, d) \
86
  do {                                 \
87
    int regs[4];                       \
88
    __cpuidex(regs, func, func2);      \
89
    a = regs[0];                       \
90
    b = regs[1];                       \
91
    c = regs[2];                       \
92
    d = regs[3];                       \
93
  } while (0)
94
#else
95
#define cpuid(func, func2, a, b, c, d) \
96
  do {                                 \
97
    int regs[4];                       \
98
    __cpuid(regs, func);               \
99
    a = regs[0];                       \
100
    b = regs[1];                       \
101
    c = regs[2];                       \
102
    d = regs[3];                       \
103
  } while (0)
104
#endif
105
#else
106
/* clang-format off */
107
#define cpuid(func, func2, a, b, c, d) \
108
  __asm mov eax, func                  \
109
  __asm mov ecx, func2                 \
110
  __asm cpuid                          \
111
  __asm mov a, eax                     \
112
  __asm mov b, ebx                     \
113
  __asm mov c, ecx                     \
114
  __asm mov d, edx
115
#endif
116
/* clang-format on */
117
#endif /* end others */
118
119
// NaCl has no support for xgetbv or the raw opcode.
120
#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
121
6
static inline uint64_t xgetbv(void) {
122
6
  const uint32_t ecx = 0;
123
6
  uint32_t eax, edx;
124
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
125
6
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
126
6
                   : "=a"(eax), "=d"(edx)
127
6
                   : "c"(ecx));
128
6
  return ((uint64_t)edx << 32) | eax;
129
6
}
aom_dsp_rtcd.c:xgetbv
Line
Count
Source
121
2
static inline uint64_t xgetbv(void) {
122
2
  const uint32_t ecx = 0;
123
2
  uint32_t eax, edx;
124
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
125
2
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
126
2
                   : "=a"(eax), "=d"(edx)
127
2
                   : "c"(ecx));
128
2
  return ((uint64_t)edx << 32) | eax;
129
2
}
aom_scale_rtcd.c:xgetbv
Line
Count
Source
121
2
static inline uint64_t xgetbv(void) {
122
2
  const uint32_t ecx = 0;
123
2
  uint32_t eax, edx;
124
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
125
2
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
126
2
                   : "=a"(eax), "=d"(edx)
127
2
                   : "c"(ecx));
128
2
  return ((uint64_t)edx << 32) | eax;
129
2
}
av1_rtcd.c:xgetbv
Line
Count
Source
121
2
static inline uint64_t xgetbv(void) {
122
2
  const uint32_t ecx = 0;
123
2
  uint32_t eax, edx;
124
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
125
2
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
126
2
                   : "=a"(eax), "=d"(edx)
127
2
                   : "c"(ecx));
128
2
  return ((uint64_t)edx << 32) | eax;
129
2
}
130
#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
131
    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
132
#include <immintrin.h>
133
#define xgetbv() _xgetbv(0)
134
#elif defined(_MSC_VER) && defined(_M_IX86)
135
static inline uint64_t xgetbv(void) {
136
  uint32_t eax_, edx_;
137
  __asm {
138
    xor ecx, ecx  // ecx = 0
139
    // Use the raw opcode for xgetbv for compatibility with older toolchains.
140
    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
141
    mov eax_, eax
142
    mov edx_, edx
143
  }
144
  return ((uint64_t)edx_ << 32) | eax_;
145
}
146
#else
147
#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
148
#endif
149
150
#if defined(_MSC_VER) && _MSC_VER >= 1700
151
#undef NOMINMAX
152
#define NOMINMAX
153
#undef WIN32_LEAN_AND_MEAN
154
#define WIN32_LEAN_AND_MEAN
155
#include <windows.h>
156
#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
157
#define getenv(x) NULL
158
#endif
159
#endif
160
161
3
#define HAS_MMX 0x01
162
3
#define HAS_SSE 0x02
163
3
#define HAS_SSE2 0x04
164
3
#define HAS_SSE3 0x08
165
98
#define HAS_SSSE3 0x10
166
50
#define HAS_SSE4_1 0x20
167
3
#define HAS_AVX 0x40
168
117
#define HAS_AVX2 0x80
169
3
#define HAS_SSE4_2 0x100
170
0
#define HAS_AVX512 0x200
171
172
#ifndef BIT
173
60
#define BIT(n) (1u << (n))
174
#endif
175
176
6
#define MMX_BITS BIT(23)
177
6
#define SSE_BITS BIT(25)
178
6
#define SSE2_BITS BIT(26)
179
6
#define SSE3_BITS BIT(0)
180
6
#define SSSE3_BITS BIT(9)
181
6
#define SSE4_1_BITS BIT(19)
182
6
#define SSE4_2_BITS BIT(20)
183
// Bits 27 (OSXSAVE) & 28 (256-bit AVX)
184
6
#define AVX_BITS (BIT(27) | BIT(28))
185
6
#define AVX2_BITS BIT(5)
186
// Bits 16 (AVX512-F) & 17 (AVX512-DQ) & 28 (AVX512-CD) & 30 (AVX512-BW)
187
// & 31 (AVX512-VL)
188
0
#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
189
// Bits 1 (AVX512-VBMI) & 6 (AVX512-VBMI2) & 8 (AVX512-GFNI) & 9 (AVX512-VAES) &
190
// 10 (AVX512-VPCLMULQDQ) & 11 (AVX512-VNNI) & 12 (AVX512-BITALG) &
191
// 14 (AVX512-POPCNTDQ)
192
#define AVX512_DL_BITS \
193
0
  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
194
195
#define FEATURE_SET(reg, feature) \
196
27
  (((reg) & (feature##_BITS)) == (feature##_BITS))
197
198
3
static inline int x86_simd_caps(void) {
199
3
  unsigned int flags = 0;
200
3
  unsigned int mask = ~0u;
201
3
  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
202
3
  char *env;
203
204
  /* See if the CPU capabilities are being overridden by the environment */
205
3
  env = getenv("AOM_SIMD_CAPS");
206
3
  if (env && *env) return (int)strtol(env, NULL, 0);
207
208
3
  env = getenv("AOM_SIMD_CAPS_MASK");
209
3
  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
210
211
  /* Ensure that CPUID leaf 1 (the standard feature flags) is available */
212
3
  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
213
214
3
  if (max_cpuid_val < 1) return 0;
215
216
  /* Get the standard feature flags */
217
3
  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
218
219
3
  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
220
3
  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
221
3
  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
222
3
  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
223
3
  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
224
3
  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
225
3
  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
226
227
  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
228
3
  if (FEATURE_SET(reg_ecx, AVX)) {
229
    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
230
3
    if ((xgetbv() & 0x6) == 0x6) {
231
3
      flags |= HAS_AVX;
232
3
      if (max_cpuid_val >= 7) {
233
        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
234
3
        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
235
3
        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
236
        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
237
        // Only set the HAS_AVX512 flag if the AVX512_DL features are supported.
238
        // Older AVX512 implementations (such as Skylake) have turbo curves that
239
        // are currently problematic for mixed AVX512/AVX2 code
240
3
        if ((xgetbv() & 0xe6) == 0xe6) {
241
0
          flags |=
242
0
              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
243
0
                  ? HAS_AVX512
244
0
                  : 0;
245
0
        }
246
3
      }
247
3
    }
248
3
  }
249
3
  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
250
3
  return flags & mask;
251
3
}
aom_dsp_rtcd.c:x86_simd_caps
Line
Count
Source
198
1
static inline int x86_simd_caps(void) {
199
1
  unsigned int flags = 0;
200
1
  unsigned int mask = ~0u;
201
1
  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
202
1
  char *env;
203
204
  /* See if the CPU capabilities are being overridden by the environment */
205
1
  env = getenv("AOM_SIMD_CAPS");
206
1
  if (env && *env) return (int)strtol(env, NULL, 0);
207
208
1
  env = getenv("AOM_SIMD_CAPS_MASK");
209
1
  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
210
211
  /* Ensure that CPUID leaf 1 (the standard feature flags) is available */
212
1
  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
213
214
1
  if (max_cpuid_val < 1) return 0;
215
216
  /* Get the standard feature flags */
217
1
  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
218
219
1
  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
220
1
  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
221
1
  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
222
1
  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
223
1
  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
224
1
  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
225
1
  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
226
227
  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
228
1
  if (FEATURE_SET(reg_ecx, AVX)) {
229
    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
230
1
    if ((xgetbv() & 0x6) == 0x6) {
231
1
      flags |= HAS_AVX;
232
1
      if (max_cpuid_val >= 7) {
233
        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
234
1
        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
235
1
        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
236
        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
237
        // Only set the HAS_AVX512 flag if the AVX512_DL features are supported.
238
        // Older AVX512 implementations (such as Skylake) have turbo curves that
239
        // are currently problematic for mixed AVX512/AVX2 code
240
1
        if ((xgetbv() & 0xe6) == 0xe6) {
241
0
          flags |=
242
0
              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
243
0
                  ? HAS_AVX512
244
0
                  : 0;
245
0
        }
246
1
      }
247
1
    }
248
1
  }
249
1
  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
250
1
  return flags & mask;
251
1
}
aom_scale_rtcd.c:x86_simd_caps
Line
Count
Source
198
1
static inline int x86_simd_caps(void) {
199
1
  unsigned int flags = 0;
200
1
  unsigned int mask = ~0u;
201
1
  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
202
1
  char *env;
203
204
  /* See if the CPU capabilities are being overridden by the environment */
205
1
  env = getenv("AOM_SIMD_CAPS");
206
1
  if (env && *env) return (int)strtol(env, NULL, 0);
207
208
1
  env = getenv("AOM_SIMD_CAPS_MASK");
209
1
  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
210
211
  /* Ensure that CPUID leaf 1 (the standard feature flags) is available */
212
1
  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
213
214
1
  if (max_cpuid_val < 1) return 0;
215
216
  /* Get the standard feature flags */
217
1
  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
218
219
1
  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
220
1
  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
221
1
  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
222
1
  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
223
1
  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
224
1
  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
225
1
  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
226
227
  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
228
1
  if (FEATURE_SET(reg_ecx, AVX)) {
229
    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
230
1
    if ((xgetbv() & 0x6) == 0x6) {
231
1
      flags |= HAS_AVX;
232
1
      if (max_cpuid_val >= 7) {
233
        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
234
1
        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
235
1
        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
236
        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
237
        // Only set the HAS_AVX512 flag if the AVX512_DL features are supported.
238
        // Older AVX512 implementations (such as Skylake) have turbo curves that
239
        // are currently problematic for mixed AVX512/AVX2 code
240
1
        if ((xgetbv() & 0xe6) == 0xe6) {
241
0
          flags |=
242
0
              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
243
0
                  ? HAS_AVX512
244
0
                  : 0;
245
0
        }
246
1
      }
247
1
    }
248
1
  }
249
1
  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
250
1
  return flags & mask;
251
1
}
av1_rtcd.c:x86_simd_caps
Line
Count
Source
198
1
static inline int x86_simd_caps(void) {
199
1
  unsigned int flags = 0;
200
1
  unsigned int mask = ~0u;
201
1
  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
202
1
  char *env;
203
204
  /* See if the CPU capabilities are being overridden by the environment */
205
1
  env = getenv("AOM_SIMD_CAPS");
206
1
  if (env && *env) return (int)strtol(env, NULL, 0);
207
208
1
  env = getenv("AOM_SIMD_CAPS_MASK");
209
1
  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
210
211
  /* Ensure that CPUID leaf 1 (the standard feature flags) is available */
212
1
  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
213
214
1
  if (max_cpuid_val < 1) return 0;
215
216
  /* Get the standard feature flags */
217
1
  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
218
219
1
  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
220
1
  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
221
1
  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
222
1
  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
223
1
  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
224
1
  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
225
1
  flags |= FEATURE_SET(reg_ecx, SSE4_2) ? HAS_SSE4_2 : 0;
226
227
  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
228
1
  if (FEATURE_SET(reg_ecx, AVX)) {
229
    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
230
1
    if ((xgetbv() & 0x6) == 0x6) {
231
1
      flags |= HAS_AVX;
232
1
      if (max_cpuid_val >= 7) {
233
        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
234
1
        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
235
1
        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
236
        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
237
        // Only set the HAS_AVX512 flag if the AVX512_DL features are supported.
238
        // Older AVX512 implementations (such as Skylake) have turbo curves that
239
        // are currently problematic for mixed AVX512/AVX2 code
240
1
        if ((xgetbv() & 0xe6) == 0xe6) {
241
0
          flags |=
242
0
              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
243
0
                  ? HAS_AVX512
244
0
                  : 0;
245
0
        }
246
1
      }
247
1
    }
248
1
  }
249
1
  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
250
1
  return flags & mask;
251
1
}
252
253
// Fine-Grain Measurement Functions
254
//
255
// If you are timing a small region of code, access the timestamp counter
256
// (TSC) via:
257
//
258
// unsigned int start = x86_tsc_start();
259
//   ...
260
// unsigned int end = x86_tsc_end();
261
// unsigned int diff = end - start;
262
//
263
// The start/end functions introduce a few more instructions than using
264
// x86_readtsc directly, but prevent the CPU's out-of-order execution from
265
// affecting the measurement (by having earlier/later instructions be evaluated
266
// in the time interval). See the white paper, "How to Benchmark Code
267
// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
268
// Gabriele Paoloni for more information.
269
//
270
// If you are timing a large function (CPU time > a couple of seconds), use
271
// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
272
// out-of-order leakage that can occur is minimal compared to total runtime.
273
0
static inline unsigned int x86_readtsc(void) {
274
0
#if defined(__GNUC__)
275
0
  unsigned int tsc;
276
0
  __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :);
277
0
  return tsc;
278
0
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
279
0
  unsigned int tsc;
280
0
  asm volatile("rdtsc\n\t" : "=a"(tsc) :);
281
0
  return tsc;
282
0
#else
283
0
#if AOM_ARCH_X86_64
284
0
  return (unsigned int)__rdtsc();
285
0
#else
286
0
  __asm rdtsc;
287
0
#endif
288
0
#endif
289
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x86_readtsc
Unexecuted instantiation: aom_scale_rtcd.c:x86_readtsc
Unexecuted instantiation: av1_rtcd.c:x86_readtsc
290
// 64-bit CPU cycle counter
291
0
static inline uint64_t x86_readtsc64(void) {
292
0
#if defined(__GNUC__)
293
0
  uint32_t hi, lo;
294
0
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
295
0
  return ((uint64_t)hi << 32) | lo;
296
0
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
297
0
  uint_t hi, lo;
298
0
  asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
299
0
  return ((uint64_t)hi << 32) | lo;
300
0
#else
301
0
#if AOM_ARCH_X86_64
302
0
  return (uint64_t)__rdtsc();
303
0
#else
304
0
  __asm rdtsc;
305
0
#endif
306
0
#endif
307
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x86_readtsc64
Unexecuted instantiation: aom_scale_rtcd.c:x86_readtsc64
Unexecuted instantiation: av1_rtcd.c:x86_readtsc64
308
309
// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
310
0
static inline unsigned int x86_readtscp(void) {
311
0
#if defined(__GNUC__)
312
0
  unsigned int tscp;
313
0
  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
314
0
  return tscp;
315
0
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
316
0
  unsigned int tscp;
317
0
  asm volatile("rdtscp\n\t" : "=a"(tscp) :);
318
0
  return tscp;
319
0
#elif defined(_MSC_VER)
320
0
  unsigned int ui;
321
0
  return (unsigned int)__rdtscp(&ui);
322
0
#else
323
0
#if AOM_ARCH_X86_64
324
0
  return (unsigned int)__rdtscp();
325
0
#else
326
0
  __asm rdtscp;
327
0
#endif
328
0
#endif
329
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x86_readtscp
Unexecuted instantiation: aom_scale_rtcd.c:x86_readtscp
Unexecuted instantiation: av1_rtcd.c:x86_readtscp
330
331
0
static inline unsigned int x86_tsc_start(void) {
332
0
  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
333
0
  // This call should not be removed. See function notes above.
334
0
  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
335
0
  // Avoid compiler warnings on unused-but-set variables.
336
0
  (void)reg_eax;
337
0
  (void)reg_ebx;
338
0
  (void)reg_ecx;
339
0
  (void)reg_edx;
340
0
  return x86_readtsc();
341
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x86_tsc_start
Unexecuted instantiation: aom_scale_rtcd.c:x86_tsc_start
Unexecuted instantiation: av1_rtcd.c:x86_tsc_start
342
343
0
static inline unsigned int x86_tsc_end(void) {
344
0
  uint32_t v = x86_readtscp();
345
0
  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
346
0
  // This call should not be removed. See function notes above.
347
0
  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
348
0
  // Avoid compiler warnings on unused-but-set variables.
349
0
  (void)reg_eax;
350
0
  (void)reg_ebx;
351
0
  (void)reg_ecx;
352
0
  (void)reg_edx;
353
0
  return v;
354
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x86_tsc_end
Unexecuted instantiation: aom_scale_rtcd.c:x86_tsc_end
Unexecuted instantiation: av1_rtcd.c:x86_tsc_end
355
356
#if defined(__GNUC__)
357
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
358
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
359
#define x86_pause_hint() asm volatile("pause \n\t")
360
#else
361
#if AOM_ARCH_X86_64
362
#define x86_pause_hint() _mm_pause();
363
#else
364
#define x86_pause_hint() __asm pause
365
#endif
366
#endif
367
368
#if defined(__GNUC__)
369
0
static void x87_set_control_word(unsigned short mode) {
370
0
  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
371
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x87_set_control_word
Unexecuted instantiation: aom_scale_rtcd.c:x87_set_control_word
Unexecuted instantiation: av1_rtcd.c:x87_set_control_word
372
0
static unsigned short x87_get_control_word(void) {
373
0
  unsigned short mode;
374
0
  __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
375
0
  return mode;
376
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x87_get_control_word
Unexecuted instantiation: aom_scale_rtcd.c:x87_get_control_word
Unexecuted instantiation: av1_rtcd.c:x87_get_control_word
377
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
378
static void x87_set_control_word(unsigned short mode) {
379
  asm volatile("fldcw %0" : : "m"(*&mode));
380
}
381
static unsigned short x87_get_control_word(void) {
382
  unsigned short mode;
383
  asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
384
  return mode;
385
}
386
#elif AOM_ARCH_X86_64
387
/* No fldcw intrinsics on Windows x64, punt to external asm */
388
extern void aom_winx64_fldcw(unsigned short mode);
389
extern unsigned short aom_winx64_fstcw(void);
390
#define x87_set_control_word aom_winx64_fldcw
391
#define x87_get_control_word aom_winx64_fstcw
392
#else
393
static void x87_set_control_word(unsigned short mode) {
394
  __asm { fldcw mode }
395
}
396
static unsigned short x87_get_control_word(void) {
397
  unsigned short mode;
398
  __asm { fstcw mode }
399
  return mode;
400
}
401
#endif
402
403
0
static inline unsigned int x87_set_double_precision(void) {
404
0
  unsigned int mode = x87_get_control_word();
405
0
  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
406
0
  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
407
0
  // 8.1.5.2 Precision Control Field
408
0
  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
409
0
  // determine the number of bits used in floating point calculations. To match
410
0
  // later SSE instructions, restrict x87 operations to Double Precision (0x200).
411
0
  // Precision                     PC Field
412
0
  // Single Precision (24-Bits)    00B
413
0
  // Reserved                      01B
414
0
  // Double Precision (53-Bits)    10B
415
0
  // Extended Precision (64-Bits)  11B
416
0
  x87_set_control_word((mode & ~0x300u) | 0x200u);
417
0
  return mode;
418
0
}
Unexecuted instantiation: aom_dsp_rtcd.c:x87_set_double_precision
Unexecuted instantiation: aom_scale_rtcd.c:x87_set_double_precision
Unexecuted instantiation: av1_rtcd.c:x87_set_double_precision
419
420
#ifdef __cplusplus
421
}  // extern "C"
422
#endif
423
424
#endif  // AOM_AOM_PORTS_X86_H_