Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/simd/v64_intrinsics_x86.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
13
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
14
15
#include <emmintrin.h>
#include <string.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif
22
23
/* 64-bit SIMD vector type.  It is carried in a 128-bit SSE register; the
   load/store helpers in this file transfer only the low 64 bits. */
typedef __m128i v64;
24
25
44.4M
/* Returns the lowest 32-bit lane of a, reinterpreted as unsigned. */
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  const int lane0 = _mm_cvtsi128_si32(a);
  return (uint32_t)lane0;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_low_u32
cdef_block_avx2.c:v64_low_u32
Line
Count
Source
25
44.4M
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
26
44.4M
  return (uint32_t)_mm_cvtsi128_si32(a);
27
44.4M
}
28
29
42.7M
/* Returns bits 32..63 of a as an unsigned 32-bit value. */
SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  /* Shift right by four bytes so the wanted lane lands in lane 0. */
  const __m128i shifted = _mm_srli_si128(a, 4);
  return (uint32_t)_mm_cvtsi128_si32(shifted);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_high_u32
cdef_block_avx2.c:v64_high_u32
Line
Count
Source
29
42.7M
SIMD_INLINE uint32_t v64_high_u32(v64 a) {
30
42.7M
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
31
42.7M
}
32
33
0
/* Returns the lowest 32-bit lane of a as a signed value. */
SIMD_INLINE int32_t v64_low_s32(v64 a) {
  const int lane0 = _mm_cvtsi128_si32(a);
  return (int32_t)lane0;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_low_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_low_s32
34
35
0
/* Returns bits 32..63 of a as a signed 32-bit value. */
SIMD_INLINE int32_t v64_high_s32(v64 a) {
  /* Shift right by four bytes so the wanted lane lands in lane 0. */
  const __m128i shifted = _mm_srli_si128(a, 4);
  return (int32_t)_mm_cvtsi128_si32(shifted);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_high_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_high_s32
38
39
0
/* Builds a vector from four 16-bit values; a becomes the most significant
   16-bit element and d the least significant one. */
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  /* Going through int16_t sign-extends every value to 32 bits, so the signed
     saturating pack below reproduces the original 16 bits unchanged. */
  const __m128i widened =
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d);
  const __m128i zero = _mm_setzero_si128();
  return _mm_packs_epi32(widened, zero);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_from_16
Unexecuted instantiation: cdef_block_avx2.c:v64_from_16
44
45
0
/* Builds a vector whose bits 32..63 hold x and bits 0..31 hold y; the upper
   64 bits of the register are zero. */
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  const int32_t high = (int32_t)x;
  const int32_t low = (int32_t)y;
  return _mm_set_epi32(0, 0, high, low);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_from_32
Unexecuted instantiation: cdef_block_avx2.c:v64_from_32
48
49
0
/* Places the 64-bit value x in the low half of a vector register; the upper
   half is zero. */
SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  const int64_t bits = (int64_t)x;
  return _mm_cvtsi64_si128(bits);
#else
  /* No 64-bit general register move here: assemble from two 32-bit halves. */
  const int32_t low = (int32_t)x;
  const int32_t high = (int32_t)(x >> 32);
  return _mm_set_epi32(0, 0, high, low);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_from_64
Unexecuted instantiation: cdef_block_avx2.c:v64_from_64
56
57
0
/* Returns the low 64 bits of x as a scalar, assembled from its two 32-bit
   lanes. */
SIMD_INLINE uint64_t v64_u64(v64 x) {
  const uint64_t low = v64_low_u32(x);
  const uint64_t high = v64_high_u32(x);
  return (high << 32) | low;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_u64
Unexecuted instantiation: cdef_block_avx2.c:v64_u64
60
61
0
/* Reads a 32-bit value from p.
   p is dereferenced directly as a uint32_t, so it must be suitably aligned
   for that type.  The cast keeps the const qualifier of the parameter.
   NOTE(review): the direct dereference is still subject to the strict
   aliasing rules when p points into a buffer of another type - confirm the
   build flags/callers make this safe. */
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *(const uint32_t *)p;
}
Unexecuted instantiation: cdef_block_sse4.c:u32_load_aligned
Unexecuted instantiation: cdef_block_avx2.c:u32_load_aligned
64
65
0
/* Reads a 32-bit value from p, which may have any alignment.
   The bytes are copied with memcpy() rather than read through a casted
   uint32_t pointer: dereferencing a misaligned uint32_t pointer is undefined
   behaviour, while a fixed-size memcpy is well defined. */
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  uint32_t value;
  memcpy(&value, p, sizeof(value));
  return value;
}
Unexecuted instantiation: cdef_block_sse4.c:u32_load_unaligned
Unexecuted instantiation: cdef_block_avx2.c:u32_load_unaligned
68
69
83.6M
/* Writes the 32-bit value a to p.  p is accessed directly as a uint32_t. */
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  uint32_t *const dst = (uint32_t *)p;
  dst[0] = a;
}
Unexecuted instantiation: cdef_block_sse4.c:u32_store_aligned
cdef_block_avx2.c:u32_store_aligned
Line
Count
Source
69
83.6M
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
70
83.6M
  *((uint32_t *)p) = a;
71
83.6M
}
72
73
0
/* Writes the 32-bit value a to p, which may have any alignment.
   The bytes are copied with memcpy() rather than written through a casted
   uint32_t pointer: storing through a misaligned uint32_t pointer is
   undefined behaviour, while a fixed-size memcpy is well defined. */
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  memcpy(p, &a, sizeof(a));
}
Unexecuted instantiation: cdef_block_sse4.c:u32_store_unaligned
Unexecuted instantiation: cdef_block_avx2.c:u32_store_unaligned
76
77
187M
/* Loads 64 bits from p into the low half of a vector register using
   _mm_loadl_epi64.  The cast preserves the const qualifier of p. */
SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((const __m128i *)p);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_load_aligned
cdef_block_avx2.c:v64_load_aligned
Line
Count
Source
77
187M
SIMD_INLINE v64 v64_load_aligned(const void *p) {
78
187M
  return _mm_loadl_epi64((__m128i *)p);
79
187M
}
80
81
689M
/* Loads 64 bits from p into the low half of a vector register using
   _mm_loadl_epi64.  The cast preserves the const qualifier of p. */
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((const __m128i *)p);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_load_unaligned
cdef_block_avx2.c:v64_load_unaligned
Line
Count
Source
81
689M
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
82
689M
  return _mm_loadl_epi64((__m128i *)p);
83
689M
}
84
85
147M
/* Stores the low 64 bits of a to p using _mm_storel_epi64. */
SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  __m128i *const dst = (__m128i *)p;
  _mm_storel_epi64(dst, a);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_store_aligned
cdef_block_avx2.c:v64_store_aligned
Line
Count
Source
85
147M
SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
86
147M
  _mm_storel_epi64((__m128i *)p, a);
87
147M
}
88
89
0
/* Stores the low 64 bits of a to p using _mm_storel_epi64. */
SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  __m128i *const dst = (__m128i *)p;
  _mm_storel_epi64(dst, a);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_store_unaligned
Unexecuted instantiation: cdef_block_avx2.c:v64_store_unaligned
92
93
/* v64_align(a, b, c): treats a:b as a 128-bit value (a in the upper 64 bits)
   and yields the 64 bits that start c bytes above the bottom of b.  When c is
   zero the result is b itself.
   The first variant uses _mm_srli_si128, so there c has to be usable as an
   immediate; the second variant works on scalars instead.
   NOTE(review): c is presumably restricted to 0..7 - confirm with callers.
   Every macro argument is parenthesised in both variants. */
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64((b), (a)), (c)) : (b))
#else
#define v64_align(a, b, c)                                                  \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif
101
102
109M
/* Returns a vector with every bit cleared. */
SIMD_INLINE v64 v64_zero(void) {
  const __m128i zero = _mm_setzero_si128();
  return zero;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_zero
cdef_block_avx2.c:v64_zero
Line
Count
Source
102
109M
SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
103
104
0
/* Broadcasts the byte x to every 8-bit lane. */
SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  const char lane = (char)x;
  return _mm_set1_epi8(lane);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_dup_8
Unexecuted instantiation: cdef_block_avx2.c:v64_dup_8
105
106
0
/* Broadcasts x to every 16-bit lane. */
SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  const short lane = (short)x;
  return _mm_set1_epi16(lane);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_dup_16
Unexecuted instantiation: cdef_block_avx2.c:v64_dup_16
107
108
0
/* Broadcasts x to every 32-bit lane. */
SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  const int lane = (int)x;
  return _mm_set1_epi32(lane);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_dup_32
Unexecuted instantiation: cdef_block_avx2.c:v64_dup_32
109
110
0
/* Lane-wise 8-bit addition (_mm_add_epi8). */
SIMD_INLINE v64 v64_add_8(v64 a, v64 b) {
  const __m128i sum = _mm_add_epi8(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_add_8
Unexecuted instantiation: cdef_block_avx2.c:v64_add_8
111
112
0
/* Lane-wise 16-bit addition (_mm_add_epi16). */
SIMD_INLINE v64 v64_add_16(v64 a, v64 b) {
  const __m128i sum = _mm_add_epi16(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_add_16
Unexecuted instantiation: cdef_block_avx2.c:v64_add_16
113
114
0
/* Lane-wise unsigned saturating 8-bit addition (_mm_adds_epu8). */
SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) {
  const __m128i sum = _mm_adds_epu8(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sadd_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_sadd_u8
115
116
0
/* Lane-wise signed saturating 8-bit addition (_mm_adds_epi8). */
SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) {
  const __m128i sum = _mm_adds_epi8(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sadd_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_sadd_s8
117
118
0
/* Lane-wise signed saturating 16-bit addition (_mm_adds_epi16). */
SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) {
  const __m128i sum = _mm_adds_epi16(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sadd_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_sadd_s16
119
120
0
/* Lane-wise 32-bit addition (_mm_add_epi32). */
SIMD_INLINE v64 v64_add_32(v64 a, v64 b) {
  const __m128i sum = _mm_add_epi32(a, b);
  return sum;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_add_32
Unexecuted instantiation: cdef_block_avx2.c:v64_add_32
121
122
0
/* Lane-wise 8-bit subtraction a - b (_mm_sub_epi8). */
SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) {
  const __m128i diff = _mm_sub_epi8(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sub_8
Unexecuted instantiation: cdef_block_avx2.c:v64_sub_8
123
124
0
/* Lane-wise unsigned saturating 8-bit subtraction a - b (_mm_subs_epu8). */
SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) {
  const __m128i diff = _mm_subs_epu8(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssub_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_ssub_u8
125
126
0
/* Lane-wise signed saturating 8-bit subtraction a - b (_mm_subs_epi8). */
SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) {
  const __m128i diff = _mm_subs_epi8(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssub_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_ssub_s8
127
128
0
/* Lane-wise 16-bit subtraction a - b (_mm_sub_epi16). */
SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) {
  const __m128i diff = _mm_sub_epi16(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sub_16
Unexecuted instantiation: cdef_block_avx2.c:v64_sub_16
129
130
0
/* Lane-wise signed saturating 16-bit subtraction a - b (_mm_subs_epi16). */
SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) {
  const __m128i diff = _mm_subs_epi16(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssub_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_ssub_s16
131
132
0
/* Lane-wise unsigned saturating 16-bit subtraction a - b (_mm_subs_epu16). */
SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) {
  const __m128i diff = _mm_subs_epu16(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssub_u16
Unexecuted instantiation: cdef_block_avx2.c:v64_ssub_u16
133
134
0
/* Lane-wise 32-bit subtraction a - b (_mm_sub_epi32). */
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) {
  const __m128i diff = _mm_sub_epi32(a, b);
  return diff;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sub_32
Unexecuted instantiation: cdef_block_avx2.c:v64_sub_32
135
136
0
/* Lane-wise absolute value of signed 16-bit elements. */
SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  /* Without SSSE3: |a| is the larger of a and its negation (0 - a). */
  const __m128i negated = _mm_sub_epi16(_mm_setzero_si128(), a);
  return _mm_max_epi16(a, negated);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_abs_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_abs_s16
143
144
0
/* Lane-wise absolute value of signed 8-bit elements. */
SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  /* negative_mask is 0xff in lanes where a < 0 and 0 elsewhere.  Adding it
     (i.e. subtracting 1) and then XOR-ing with it negates exactly those
     lanes. */
  const __m128i zero = _mm_setzero_si128();
  const v64 negative_mask = _mm_cmplt_epi8(a, zero);
  const __m128i biased = _mm_add_epi8(a, negative_mask);
  return _mm_xor_si128(negative_mask, biased);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_abs_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_abs_s8
152
153
0
/* Byte-interleaves b and a with _mm_unpacklo_epi8 (b supplies the even byte
   positions); the low 64 bits hold the zip of the low four bytes of each. */
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi8(b, a);
  return zipped;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziplo_8
Unexecuted instantiation: cdef_block_avx2.c:v64_ziplo_8
154
155
0
/* Byte-interleaves b and a, then moves the upper 64 bits of that result down,
   so the low 64 bits hold the zip of bytes 4..7 of each input. */
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi8(b, a);
  return _mm_srli_si128(zipped, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziphi_8
Unexecuted instantiation: cdef_block_avx2.c:v64_ziphi_8
158
159
0
/* Interleaves the 16-bit elements of b and a with _mm_unpacklo_epi16 (b
   supplies the even positions). */
SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi16(b, a);
  return zipped;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziplo_16
Unexecuted instantiation: cdef_block_avx2.c:v64_ziplo_16
160
161
0
/* Interleaves the 16-bit elements of b and a, then moves the upper 64 bits of
   that result down into the low half. */
SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi16(b, a);
  return _mm_srli_si128(zipped, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziphi_16
Unexecuted instantiation: cdef_block_avx2.c:v64_ziphi_16
164
165
0
/* Interleaves the 32-bit elements of b and a with _mm_unpacklo_epi32 (b
   supplies the even positions). */
SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi32(b, a);
  return zipped;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziplo_32
Unexecuted instantiation: cdef_block_avx2.c:v64_ziplo_32
166
167
0
/* Interleaves the 32-bit elements of b and a, then moves the upper 64 bits of
   that result down into the low half. */
SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  const __m128i zipped = _mm_unpacklo_epi32(b, a);
  return _mm_srli_si128(zipped, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ziphi_32
Unexecuted instantiation: cdef_block_avx2.c:v64_ziphi_32
170
171
0
/* Narrows the signed 32-bit elements of a and b to 16 bits with signed
   saturation; b's results occupy the lower 16-bit lanes, a's the upper. */
SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  /* Put b in the low and a in the high 64 bits of one register. */
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(joined, joined);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_pack_s32_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_pack_s32_s16
175
176
0
/* Narrows the signed 32-bit elements of a and b to 16 bits with unsigned
   saturation (results clamped to 0..65535); b's results occupy the lower
   16-bit lanes, a's the upper. */
SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
#if defined(__SSE4_1__)
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi32(joined, joined);
#else
  /* No packus_epi32 before SSE4.1: clamp each lane in scalar code. */
  const int32_t a_high = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
  const int32_t a_low = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
  const int32_t b_high = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
  const int32_t b_low = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
  return v64_from_16(a_high, a_low, b_high, b_low);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_pack_s32_u16
Unexecuted instantiation: cdef_block_avx2.c:v64_pack_s32_u16
188
189
0
/* Narrows the signed 16-bit elements of a and b to bytes with unsigned
   saturation; b's results occupy the lower byte lanes, a's the upper. */
SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  /* Put b in the low and a in the high 64 bits of one register. */
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(joined, joined);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_pack_s16_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_pack_s16_u8
193
194
0
/* Narrows the signed 16-bit elements of a and b to bytes with signed
   saturation; b's results occupy the lower byte lanes, a's the upper. */
SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  /* Put b in the low and a in the high 64 bits of one register. */
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(joined, joined);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_pack_s16_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_pack_s16_s8
198
199
0
/* Collects the odd-numbered bytes of b (into the lower lanes) and of a (into
   the upper lanes). */
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  const __m128i odd_bytes = v64_from_64(0x0f0d0b0907050301LL);
  return _mm_shuffle_epi8(joined, odd_bytes);
#else
  /* Shift each 16-bit lane right by 8 so only its upper byte remains, then
     pack the lanes down to bytes. */
  const __m128i b_high_bytes = _mm_srli_epi16(b, 8);
  const __m128i a_high_bytes = _mm_srli_epi16(a, 8);
  const __m128i joined = _mm_unpacklo_epi64(b_high_bytes, a_high_bytes);
  return _mm_packus_epi16(joined, _mm_setzero_si128());
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unziphi_8
Unexecuted instantiation: cdef_block_avx2.c:v64_unziphi_8
209
210
0
/* Collects the even-numbered bytes of b (into the lower lanes) and of a (into
   the upper lanes). */
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  const __m128i even_bytes = v64_from_64(0x0e0c0a0806040200LL);
  return _mm_shuffle_epi8(joined, even_bytes);
#else
  /* Shifting both inputs up by one byte turns even bytes into odd ones, so
     the odd-byte routine can be reused. */
  const __m128i a_shifted = _mm_slli_si128(a, 1);
  const __m128i b_shifted = _mm_slli_si128(b, 1);
  return v64_unziphi_8(a_shifted, b_shifted);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unziplo_8
Unexecuted instantiation: cdef_block_avx2.c:v64_unziplo_8
218
219
0
/* Collects the odd-numbered 16-bit elements of b (into the lower lanes) and
   of a (into the upper lanes). */
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  const __m128i odd_words = v64_from_64(0x0f0e0b0a07060302LL);
  return _mm_shuffle_epi8(joined, odd_words);
#else
  /* An arithmetic shift by 16 leaves the upper 16 bits of each 32-bit lane
     sign-extended, so the signed saturating pack keeps them unchanged. */
  const __m128i b_high_words = _mm_srai_epi32(b, 16);
  const __m128i a_high_words = _mm_srai_epi32(a, 16);
  const __m128i joined = _mm_unpacklo_epi64(b_high_words, a_high_words);
  return _mm_packs_epi32(joined, _mm_setzero_si128());
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unziphi_16
Unexecuted instantiation: cdef_block_avx2.c:v64_unziphi_16
229
230
0
/* Collects the even-numbered 16-bit elements of b (into the lower lanes) and
   of a (into the upper lanes). */
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  const __m128i joined = _mm_unpacklo_epi64(b, a);
  const __m128i even_words = v64_from_64(0x0d0c090805040100LL);
  return _mm_shuffle_epi8(joined, even_words);
#else
  /* Shifting both inputs up by two bytes turns even elements into odd ones,
     so the odd-element routine can be reused. */
  const __m128i a_shifted = _mm_slli_si128(a, 2);
  const __m128i b_shifted = _mm_slli_si128(b, 2);
  return v64_unziphi_16(a_shifted, b_shifted);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unziplo_16
Unexecuted instantiation: cdef_block_avx2.c:v64_unziplo_16
238
239
0
/* Zero-extends the bytes of a to 16-bit lanes by interleaving them with
   zero bytes. */
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_unpacklo_epi8(a, zero);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpacklo_u8_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_unpacklo_u8_s16
242
243
0
/* Zero-extends the bytes of a to 16-bit lanes, then moves the upper 64 bits
   of that result (derived from bytes 4..7 of a) down into the low half. */
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i widened = _mm_unpacklo_epi8(a, zero);
  return _mm_srli_si128(widened, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpackhi_u8_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_unpackhi_u8_s16
246
247
0
/* Sign-extends the bytes of a to 16-bit lanes. */
SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  /* Duplicate each byte into both halves of a 16-bit lane; the arithmetic
     shift then keeps the upper copy and replicates its sign bit. */
  const __m128i doubled = _mm_unpacklo_epi8(a, a);
  return _mm_srai_epi16(doubled, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpacklo_s8_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_unpacklo_s8_s16
250
251
0
/* Sign-extends the bytes of a to 16-bit lanes, then moves the upper 64 bits
   of that result (derived from bytes 4..7 of a) down into the low half. */
SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  const __m128i doubled = _mm_unpacklo_epi8(a, a);
  const __m128i widened = _mm_srai_epi16(doubled, 8);
  return _mm_srli_si128(widened, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpackhi_s8_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_unpackhi_s8_s16
254
255
0
/* Zero-extends the 16-bit elements of a to 32-bit lanes by interleaving them
   with zero. */
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_unpacklo_epi16(a, zero);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpacklo_u16_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_unpacklo_u16_s32
258
259
0
/* Sign-extends the 16-bit elements of a to 32-bit lanes. */
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  /* Place each element in the upper half of a 32-bit lane, then shift it
     back down arithmetically to replicate the sign bit. */
  const __m128i zero = _mm_setzero_si128();
  const __m128i in_upper_half = _mm_unpacklo_epi16(zero, a);
  return _mm_srai_epi32(in_upper_half, 16);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpacklo_s16_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_unpacklo_s16_s32
262
263
0
/* Zero-extends the 16-bit elements of a to 32-bit lanes, then moves the upper
   64 bits of that result down into the low half. */
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i widened = _mm_unpacklo_epi16(a, zero);
  return _mm_srli_si128(widened, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpackhi_u16_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_unpackhi_u16_s32
266
267
0
/* Sign-extends the 16-bit elements of a to 32-bit lanes, then moves the upper
   64 bits of that result down into the low half. */
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i in_upper_half = _mm_unpacklo_epi16(zero, a);
  const __m128i widened = _mm_srai_epi32(in_upper_half, 16);
  return _mm_srli_si128(widened, 8);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_unpackhi_s16_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_unpackhi_s16_s32
271
272
0
/* Byte shuffle: output byte i is the byte of x selected by byte i of pattern.
   With SSSE3 this is a single _mm_shuffle_epi8.  The fallback handles the low
   eight bytes in scalar code and follows the same selection rule as that
   instruction: a pattern byte with bit 7 set yields zero, otherwise its low
   four bits index into the 16 bytes of x.  This keeps every read inside x,
   and the result is zero-initialised so no indeterminate bytes are returned
   in the upper half. */
SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output = _mm_setzero_si128();
  const unsigned char *input = (const unsigned char *)&x;
  const unsigned char *index = (const unsigned char *)&pattern;
  unsigned char *selected = (unsigned char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    const unsigned char sel = index[counter];
    selected[counter] = (sel & 0x80) ? 0 : input[sel & 15];
  }

  return output;
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shuffle_8
Unexecuted instantiation: cdef_block_avx2.c:v64_shuffle_8
289
290
0
/* Dot product of the eight bytes of a, taken as signed, with the eight bytes
   of b, taken as unsigned. */
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  /* Widen both operands to 16 bits: a sign-extended, b zero-extended. */
  const __m128i a_wide = _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
  const __m128i b_wide = _mm_unpacklo_epi8(b, _mm_setzero_si128());
  __m128i acc = _mm_madd_epi16(a_wide, b_wide);
  /* Fold the four 32-bit partial sums into lane 0. */
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  return (int32_t)v64_low_u32(acc);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_dotp_su8
Unexecuted instantiation: cdef_block_avx2.c:v64_dotp_su8
297
298
0
/* Dot product of the four signed 16-bit elements of a and b, returned as a
   64-bit sum of the two 32-bit pair sums produced by _mm_madd_epi16. */
SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  const __m128i pair_sums = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  /* Sign-extend the two low 32-bit sums to 64 bits and add them in-register. */
  const __m128i wide = _mm_cvtepi32_epi64(pair_sums);
  const __m128i total = _mm_add_epi64(wide, _mm_srli_si128(wide, 8));
  return _mm_cvtsi128_si64(total);
#else
  const int64_t second = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(pair_sums, 4));
  const int64_t first = (int64_t)_mm_cvtsi128_si32(pair_sums);
  return second + first;
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_dotp_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_dotp_s16
308
309
0
/* Sums the eight low bytes of a, taken as unsigned: the sum of absolute
   differences against zero is the plain sum. */
SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sums = _mm_sad_epu8(a, zero);
  return v64_low_u32(sums);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_hadd_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_hadd_u8
312
313
0
/* Sums the signed 16-bit elements of a by taking the dot product with a
   vector of ones. */
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  const v64 ones = v64_dup_16(1);
  return v64_dotp_s16(a, ones);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_hadd_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_hadd_s16
316
317
/* Accumulator type passed between v64_sad_u8_init(), v64_sad_u8() and
   v64_sad_u8_sum(). */
typedef v64 sad64_internal;
318
319
0
/* Returns a zeroed accumulator for v64_sad_u8(). */
SIMD_INLINE sad64_internal v64_sad_u8_init(void) {
  const __m128i empty = _mm_setzero_si128();
  return empty;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sad_u8_init
Unexecuted instantiation: cdef_block_avx2.c:v64_sad_u8_init
320
321
/* Implementation dependent return value.  Result must be finalised with
322
   v64_sad_u8_sum().
323
   The result for more than 32 v64_sad_u8() calls is undefined. */
324
0
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
325
0
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
326
0
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sad_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_sad_u8
327
328
0
/* Finalises a v64_sad_u8() accumulator: the total is its lowest 32-bit lane. */
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  const uint32_t total = v64_low_u32(s);
  return total;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_sad_u8_sum
Unexecuted instantiation: cdef_block_avx2.c:v64_sad_u8_sum
329
330
/* Accumulator type passed between v64_ssd_u8_init(), v64_ssd_u8() and
   v64_ssd_u8_sum(). */
typedef v64 ssd64_internal;
331
332
0
/* Returns a zeroed accumulator for v64_ssd_u8(). */
SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) {
  const __m128i empty = _mm_setzero_si128();
  return empty;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssd_u8_init
Unexecuted instantiation: cdef_block_avx2.c:v64_ssd_u8_init
333
334
/* Implementation dependent return value.  Result must be finalised with
335
 * v64_ssd_u8_sum(). */
336
0
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
337
0
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
338
0
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
339
0
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
340
0
  return _mm_add_epi64(
341
0
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
342
0
}
Unexecuted instantiation: cdef_block_sse4.c:v64_ssd_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_ssd_u8
343
344
0
/* Finalises a v64_ssd_u8() accumulator: the total is its lowest 32-bit lane.
   The parameter uses ssd64_internal, the accumulator type that
   v64_ssd_u8_init() and v64_ssd_u8() produce (it has the same underlying type
   as sad64_internal, so existing callers are unaffected). */
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
Unexecuted instantiation: cdef_block_sse4.c:v64_ssd_u8_sum
Unexecuted instantiation: cdef_block_avx2.c:v64_ssd_u8_sum
345
346
0
/* Bitwise OR of a and b. */
SIMD_INLINE v64 v64_or(v64 a, v64 b) {
  const __m128i bits = _mm_or_si128(a, b);
  return bits;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_or
Unexecuted instantiation: cdef_block_avx2.c:v64_or
347
348
0
/* Bitwise exclusive OR of a and b. */
SIMD_INLINE v64 v64_xor(v64 a, v64 b) {
  const __m128i bits = _mm_xor_si128(a, b);
  return bits;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_xor
Unexecuted instantiation: cdef_block_avx2.c:v64_xor
349
350
0
/* Bitwise AND of a and b. */
SIMD_INLINE v64 v64_and(v64 a, v64 b) {
  const __m128i bits = _mm_and_si128(a, b);
  return bits;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_and
Unexecuted instantiation: cdef_block_avx2.c:v64_and
351
352
0
/* Returns a AND (NOT b).  Note the swapped operands: _mm_andnot_si128
   complements its first argument. */
SIMD_INLINE v64 v64_andn(v64 a, v64 b) {
  const __m128i bits = _mm_andnot_si128(b, a);
  return bits;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_andn
Unexecuted instantiation: cdef_block_avx2.c:v64_andn
353
354
0
/* Lane-wise 16-bit multiply keeping the low 16 bits of each product
   (_mm_mullo_epi16). */
SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) {
  const __m128i product = _mm_mullo_epi16(a, b);
  return product;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_mullo_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_mullo_s16
355
356
0
/* Lane-wise signed 16-bit multiply keeping the high 16 bits of each product
   (_mm_mulhi_epi16). */
SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) {
  const __m128i product = _mm_mulhi_epi16(a, b);
  return product;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_mulhi_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_mulhi_s16
357
358
0
/* Lane-wise 32-bit multiply keeping the low 32 bits of each product. */
SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  /* _mm_mul_epu32 multiplies only the even 32-bit lanes, so lane 1 is shifted
     into lane 0 for a second multiply; interleaving the two results puts the
     low halves of both products side by side. */
  const __m128i product0 = _mm_mul_epu32(a, b);
  const __m128i product1 =
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  return _mm_unpacklo_epi32(product0, product1);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_mullo_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_mullo_s32
367
368
0
/* Multiplies signed 16-bit lanes and adds adjacent products into 32-bit
   lanes (_mm_madd_epi16). */
SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) {
  const __m128i pair_sums = _mm_madd_epi16(a, b);
  return pair_sums;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_madd_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_madd_s16
369
370
0
/* Multiplies the bytes of a (taken as unsigned) by the bytes of b (taken as
   signed) and adds adjacent products into 16-bit lanes with signed
   saturation. */
SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  /* Widen to 16 bits (a zero-extended, b sign-extended), form 32-bit pair
     sums, then saturate them down to 16 bits. */
  const __m128i a_wide = _mm_unpacklo_epi8(a, _mm_setzero_si128());
  const __m128i b_wide = _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8);
  const __m128i pair_sums = _mm_madd_epi16(a_wide, b_wide);
  return _mm_packs_epi32(pair_sums, pair_sums);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_madd_us8
Unexecuted instantiation: cdef_block_avx2.c:v64_madd_us8
379
380
0
/* Lane-wise rounded average of unsigned bytes (_mm_avg_epu8). */
SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) {
  const __m128i mean = _mm_avg_epu8(a, b);
  return mean;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_avg_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_avg_u8
381
382
0
/* Lane-wise average of unsigned bytes, rounded down. */
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  /* _mm_avg_epu8 rounds up; it overshoots by one exactly when a + b is odd,
     i.e. when the low bits of a and b differ. */
  const __m128i rounded_up = _mm_avg_epu8(a, b);
  const __m128i odd_sum = _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1));
  return _mm_sub_epi8(rounded_up, odd_sum);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_rdavg_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_rdavg_u8
386
387
0
/* Lane-wise average of unsigned 16-bit elements, rounded down. */
SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
  /* _mm_avg_epu16 rounds up; it overshoots by one exactly when a + b is odd,
     i.e. when the low bits of a and b differ. */
  const __m128i rounded_up = _mm_avg_epu16(a, b);
  const __m128i odd_sum = _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1));
  return _mm_sub_epi16(rounded_up, odd_sum);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_rdavg_u16
Unexecuted instantiation: cdef_block_avx2.c:v64_rdavg_u16
391
392
0
/* Lane-wise rounded average of unsigned 16-bit elements (_mm_avg_epu16). */
SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) {
  const __m128i mean = _mm_avg_epu16(a, b);
  return mean;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_avg_u16
Unexecuted instantiation: cdef_block_avx2.c:v64_avg_u16
393
394
0
/* Lane-wise minimum of unsigned bytes (_mm_min_epu8). */
SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) {
  const __m128i smaller = _mm_min_epu8(a, b);
  return smaller;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_min_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_min_u8
395
396
0
/* Lane-wise maximum of unsigned bytes (_mm_max_epu8). */
SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) {
  const __m128i larger = _mm_max_epu8(a, b);
  return larger;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_max_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_max_u8
397
398
0
/* Lane-wise minimum of signed bytes. */
SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  /* Select a where a < b and b elsewhere. */
  const v64 a_is_less = _mm_cmplt_epi8(a, b);
  const __m128i from_b = _mm_andnot_si128(a_is_less, b);
  const __m128i from_a = _mm_and_si128(a_is_less, a);
  return _mm_or_si128(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_min_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_min_s8
406
407
0
/* Lane-wise maximum of signed bytes. */
SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  /* Select a where b < a and b elsewhere. */
  const v64 a_is_greater = _mm_cmplt_epi8(b, a);
  const __m128i from_b = _mm_andnot_si128(a_is_greater, b);
  const __m128i from_a = _mm_and_si128(a_is_greater, a);
  return _mm_or_si128(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v64_max_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_max_s8
415
416
0
/* Lane-wise minimum of signed 16-bit elements (_mm_min_epi16). */
SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) {
  const __m128i smaller = _mm_min_epi16(a, b);
  return smaller;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_min_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_min_s16
417
418
0
/* Lane-wise maximum of signed 16-bit elements (_mm_max_epi16). */
SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) {
  const __m128i larger = _mm_max_epi16(a, b);
  return larger;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_max_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_max_s16
419
420
0
/* Lane-wise signed byte compare: all-ones where a > b, zero elsewhere. */
SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) {
  const __m128i mask = _mm_cmpgt_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmpgt_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_cmpgt_s8
421
422
0
/* Lane-wise signed byte compare: all-ones where a < b, zero elsewhere. */
SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) {
  const __m128i mask = _mm_cmplt_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmplt_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_cmplt_s8
423
424
0
/* Lane-wise byte compare: all-ones where a == b, zero elsewhere. */
SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) {
  const __m128i mask = _mm_cmpeq_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmpeq_8
Unexecuted instantiation: cdef_block_avx2.c:v64_cmpeq_8
425
426
0
/* Lane-wise signed 16-bit compare: all-ones where a > b, zero elsewhere. */
SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) {
  const __m128i mask = _mm_cmpgt_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmpgt_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_cmpgt_s16
427
428
0
/* Lane-wise signed 16-bit compare: all-ones where a < b, zero elsewhere. */
SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) {
  const __m128i mask = _mm_cmplt_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmplt_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_cmplt_s16
429
430
0
/* Lane-wise 16-bit compare: all-ones where a == b, zero elsewhere. */
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) {
  const __m128i mask = _mm_cmpeq_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v64_cmpeq_16
Unexecuted instantiation: cdef_block_avx2.c:v64_cmpeq_16
431
432
0
/* Shifts every byte of a left by c bits. */
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  /* There is no 8-bit shift: shift 16-bit lanes, then mask off the bits that
     crossed over from the neighbouring byte. */
  const __m128i count = _mm_cvtsi32_si128((int)c);
  const __m128i keep = _mm_set1_epi8((char)(0xff << c));
  return _mm_and_si128(keep, _mm_sll_epi16(a, count));
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shl_8
Unexecuted instantiation: cdef_block_avx2.c:v64_shl_8
436
437
0
/* Shifts every byte of a right by c bits, filling with zeros. */
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  /* There is no 8-bit shift: shift 16-bit lanes, then mask off the bits that
     crossed over from the neighbouring byte. */
  const __m128i count = _mm_cvtsi32_si128((int)c);
  const __m128i keep = _mm_set1_epi8((char)(0xff >> c));
  return _mm_and_si128(keep, _mm_srl_epi16(a, count));
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_u8
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_u8
441
442
0
/* Shifts every byte of a right by c bits, replicating the sign bit. */
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  /* Duplicate each byte into a 16-bit lane, shift arithmetically by c + 8 so
     the lane holds the sign-extended shifted byte, then pack back to bytes.
     The second pack operand only fills the upper half of the register. */
  const __m128i doubled = _mm_unpacklo_epi8(a, a);
  const __m128i count = _mm_cvtsi32_si128((int)(c + 8));
  const __m128i shifted = _mm_sra_epi16(doubled, count);
  return _mm_packs_epi16(shifted, a);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_s8
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_s8
447
448
0
/* Shifts every 16-bit lane of a left by c bits. */
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sll_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shl_16
Unexecuted instantiation: cdef_block_avx2.c:v64_shl_16
451
452
0
/* Shifts every 16-bit lane of a right by c bits, filling with zeros. */
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_srl_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_u16
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_u16
455
456
0
/* Shifts every 16-bit lane of a right by c bits, replicating the sign bit. */
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sra_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_s16
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_s16
459
460
0
/* Shifts every 32-bit lane of a left by c bits. */
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sll_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shl_32
Unexecuted instantiation: cdef_block_avx2.c:v64_shl_32
463
464
0
/* Shifts every 32-bit lane of a right by c bits, filling with zeros. */
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_srl_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_u32
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_u32
467
468
0
/* Shifts every 32-bit lane of a right by c bits, replicating the sign bit. */
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sra_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v64_shr_s32
Unexecuted instantiation: cdef_block_avx2.c:v64_shr_s32
471
472
/* These intrinsics require immediate values, so we must use #defines
   to enforce that.
   Every macro parameter is parenthesised in the expansion so that an
   argument containing a low-precedence operator (for example a conditional
   expression passed as c) is not re-associated with the "+ 8" or with the
   surrounding call. */
#define v64_shl_n_byte(a, c) _mm_slli_si128((a), (c))
/* Duplicating a into both halves and shifting by c + 8 bytes leaves a >> c
   bytes in the low 64 bits with zero fill. */
#define v64_shr_n_byte(a, c) \
  _mm_srli_si128(_mm_unpacklo_epi64((a), (a)), (c) + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16((a), (c)))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16((a), (c)))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8((a), (a)), (c) + 8), (a))
#define v64_shl_n_16(a, c) _mm_slli_epi16((a), (c))
#define v64_shr_n_u16(a, c) _mm_srli_epi16((a), (c))
#define v64_shr_n_s16(a, c) _mm_srai_epi16((a), (c))
#define v64_shl_n_32(a, c) _mm_slli_epi32((a), (c))
#define v64_shr_n_u32(a, c) _mm_srli_epi32((a), (c))
#define v64_shr_n_s32(a, c) _mm_srai_epi32((a), (c))
488
489
#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_