Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/simd/v128_intrinsics_x86.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
13
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
14
15
#include <stdint.h>
16
#include "aom_dsp/simd/v64_intrinsics_x86.h"
17
18
typedef __m128i v128;
19
20
8.18k
SIMD_INLINE uint32_t v128_low_u32(v128 a) {
21
8.18k
  return (uint32_t)_mm_cvtsi128_si32(a);
22
8.18k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_low_u32
cdef_block_avx2.c:v128_low_u32
Line
Count
Source
20
8.18k
SIMD_INLINE uint32_t v128_low_u32(v128 a) {
21
8.18k
  return (uint32_t)_mm_cvtsi128_si32(a);
22
8.18k
}
23
24
109M
SIMD_INLINE v64 v128_low_v64(v128 a) {
25
109M
  return _mm_unpacklo_epi64(a, v64_zero());
26
109M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_low_v64
cdef_block_avx2.c:v128_low_v64
Line
Count
Source
24
109M
SIMD_INLINE v64 v128_low_v64(v128 a) {
25
109M
  return _mm_unpacklo_epi64(a, v64_zero());
26
109M
}
27
28
106M
SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
Unexecuted instantiation: cdef_block_sse4.c:v128_high_v64
cdef_block_avx2.c:v128_high_v64
Line
Count
Source
28
106M
SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
29
30
401M
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
31
401M
  return _mm_unpacklo_epi64(b, a);
32
401M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_from_v64
cdef_block_avx2.c:v128_from_v64
Line
Count
Source
30
401M
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
31
401M
  return _mm_unpacklo_epi64(b, a);
32
401M
}
33
34
0
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
35
0
  return v128_from_v64(v64_from_64(a), v64_from_64(b));
36
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_from_64
Unexecuted instantiation: cdef_block_avx2.c:v128_from_64
37
38
147k
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
39
147k
  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
40
147k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_from_32
cdef_block_avx2.c:v128_from_32
Line
Count
Source
38
147k
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
39
147k
  return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
40
147k
}
41
42
150M
SIMD_INLINE v128 v128_load_aligned(const void *p) {
43
150M
  return _mm_load_si128((__m128i *)p);
44
150M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_load_aligned
cdef_block_avx2.c:v128_load_aligned
Line
Count
Source
42
150M
SIMD_INLINE v128 v128_load_aligned(const void *p) {
43
150M
  return _mm_load_si128((__m128i *)p);
44
150M
}
45
46
724M
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
47
724M
#if defined(__SSSE3__)
48
724M
  return _mm_lddqu_si128((__m128i *)p);
49
#else
50
  return _mm_loadu_si128((__m128i *)p);
51
#endif
52
724M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_load_unaligned
cdef_block_avx2.c:v128_load_unaligned
Line
Count
Source
46
724M
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
47
724M
#if defined(__SSSE3__)
48
724M
  return _mm_lddqu_si128((__m128i *)p);
49
#else
50
  return _mm_loadu_si128((__m128i *)p);
51
#endif
52
724M
}
53
54
0
SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
55
0
  _mm_store_si128((__m128i *)p, a);
56
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_store_aligned
Unexecuted instantiation: cdef_block_avx2.c:v128_store_aligned
57
58
151M
SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
59
151M
  _mm_storeu_si128((__m128i *)p, a);
60
151M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_store_unaligned
cdef_block_avx2.c:v128_store_unaligned
Line
Count
Source
58
151M
SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
59
151M
  _mm_storeu_si128((__m128i *)p, a);
60
151M
}
61
62
// v128_align() requires its byte-shift argument c to be an immediate.
63
// Some compilers will check this during optimisation, others won't.
64
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
65
#if defined(__SSSE3__)
66
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
67
  return c ? _mm_alignr_epi8(a, b, c) : b;
68
}
69
#else
70
#define v128_align(a, b, c) \
71
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
72
#endif
73
#else
74
#if defined(__SSSE3__)
75
16.3k
#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
76
#else
77
#define v128_align(a, b, c) \
78
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
79
#endif
80
#endif
81
82
0
SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); }
Unexecuted instantiation: cdef_block_sse4.c:v128_zero
Unexecuted instantiation: cdef_block_avx2.c:v128_zero
83
84
0
SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
Unexecuted instantiation: cdef_block_sse4.c:v128_dup_8
Unexecuted instantiation: cdef_block_avx2.c:v128_dup_8
85
86
65.4k
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
Unexecuted instantiation: cdef_block_sse4.c:v128_dup_16
cdef_block_avx2.c:v128_dup_16
Line
Count
Source
86
65.4k
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
87
88
16.3k
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
Unexecuted instantiation: cdef_block_sse4.c:v128_dup_32
cdef_block_avx2.c:v128_dup_32
Line
Count
Source
88
16.3k
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
89
90
0
SIMD_INLINE v128 v128_dup_64(uint64_t x) {
91
0
  // _mm_set_pi64x and _mm_cvtsi64x_si64 are missing in some compilers
92
0
  return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32),
93
0
                       (int32_t)x);
94
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_dup_64
Unexecuted instantiation: cdef_block_avx2.c:v128_dup_64
95
96
0
SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_add_8
Unexecuted instantiation: cdef_block_avx2.c:v128_add_8
97
98
523k
SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_add_16
cdef_block_avx2.c:v128_add_16
Line
Count
Source
98
523k
SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
99
100
0
SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sadd_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_sadd_u8
101
102
0
SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sadd_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_sadd_s8
103
104
0
SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sadd_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_sadd_s16
105
106
98.1k
SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_add_32
cdef_block_avx2.c:v128_add_32
Line
Count
Source
106
98.1k
SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
107
108
0
SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_add_64
Unexecuted instantiation: cdef_block_avx2.c:v128_add_64
109
110
0
SIMD_INLINE v128 v128_padd_s16(v128 a) {
111
0
  return _mm_madd_epi16(a, _mm_set1_epi16(1));
112
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_padd_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_padd_s16
113
114
0
SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sub_8
Unexecuted instantiation: cdef_block_avx2.c:v128_sub_8
115
116
0
SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssub_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_ssub_u8
117
118
0
SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssub_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_ssub_s8
119
120
65.4k
SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sub_16
cdef_block_avx2.c:v128_sub_16
Line
Count
Source
120
65.4k
SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
121
122
0
SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssub_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_ssub_s16
123
124
0
SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssub_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_ssub_u16
125
126
0
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sub_32
Unexecuted instantiation: cdef_block_avx2.c:v128_sub_32
127
128
0
SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sub_64
Unexecuted instantiation: cdef_block_avx2.c:v128_sub_64
129
130
0
SIMD_INLINE v128 v128_abs_s16(v128 a) {
131
0
#if defined(__SSSE3__)
132
0
  return _mm_abs_epi16(a);
133
#else
134
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
135
#endif
136
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_abs_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_abs_s16
137
138
0
SIMD_INLINE v128 v128_abs_s8(v128 a) {
139
0
#if defined(__SSSE3__)
140
0
  return _mm_abs_epi8(a);
141
0
#else
142
0
  v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
143
0
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
144
0
#endif
145
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_abs_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_abs_s8
146
147
0
SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
148
0
  return _mm_unpacklo_epi8(b, a);
149
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziplo_8
Unexecuted instantiation: cdef_block_avx2.c:v128_ziplo_8
150
151
0
SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
152
0
  return _mm_unpackhi_epi8(b, a);
153
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziphi_8
Unexecuted instantiation: cdef_block_avx2.c:v128_ziphi_8
154
155
81.8k
SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
156
81.8k
  return _mm_unpacklo_epi16(b, a);
157
81.8k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziplo_16
cdef_block_avx2.c:v128_ziplo_16
Line
Count
Source
155
81.8k
SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
156
81.8k
  return _mm_unpacklo_epi16(b, a);
157
81.8k
}
158
159
81.8k
SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
160
81.8k
  return _mm_unpackhi_epi16(b, a);
161
81.8k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziphi_16
cdef_block_avx2.c:v128_ziphi_16
Line
Count
Source
159
81.8k
SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
160
81.8k
  return _mm_unpackhi_epi16(b, a);
161
81.8k
}
162
163
65.4k
SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
164
65.4k
  return _mm_unpacklo_epi32(b, a);
165
65.4k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziplo_32
cdef_block_avx2.c:v128_ziplo_32
Line
Count
Source
163
65.4k
SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
164
65.4k
  return _mm_unpacklo_epi32(b, a);
165
65.4k
}
166
167
65.4k
SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
168
65.4k
  return _mm_unpackhi_epi32(b, a);
169
65.4k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziphi_32
cdef_block_avx2.c:v128_ziphi_32
Line
Count
Source
167
65.4k
SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
168
65.4k
  return _mm_unpackhi_epi32(b, a);
169
65.4k
}
170
171
65.4k
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
172
65.4k
  return _mm_unpacklo_epi64(b, a);
173
65.4k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziplo_64
cdef_block_avx2.c:v128_ziplo_64
Line
Count
Source
171
65.4k
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
172
65.4k
  return _mm_unpacklo_epi64(b, a);
173
65.4k
}
174
175
65.4k
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
176
65.4k
  return _mm_unpackhi_epi64(b, a);
177
65.4k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ziphi_64
cdef_block_avx2.c:v128_ziphi_64
Line
Count
Source
175
65.4k
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
176
65.4k
  return _mm_unpackhi_epi64(b, a);
177
65.4k
}
178
179
0
SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
Unexecuted instantiation: cdef_block_sse4.c:v128_zip_8
Unexecuted instantiation: cdef_block_avx2.c:v128_zip_8
180
181
0
SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
Unexecuted instantiation: cdef_block_sse4.c:v128_zip_16
Unexecuted instantiation: cdef_block_avx2.c:v128_zip_16
182
183
0
SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
Unexecuted instantiation: cdef_block_sse4.c:v128_zip_32
Unexecuted instantiation: cdef_block_avx2.c:v128_zip_32
184
185
0
SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
186
0
  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
187
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziphi_8
Unexecuted instantiation: cdef_block_avx2.c:v128_unziphi_8
188
189
0
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
190
0
#if defined(__SSSE3__)
191
0
#ifdef __x86_64__
192
0
  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
193
0
#else
194
0
  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
195
0
#endif
196
0
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
197
0
                            _mm_shuffle_epi8(a, order));
198
0
#else
199
0
  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
200
0
#endif
201
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziplo_8
Unexecuted instantiation: cdef_block_avx2.c:v128_unziplo_8
202
203
0
SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
204
0
  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
205
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziphi_16
Unexecuted instantiation: cdef_block_avx2.c:v128_unziphi_16
206
207
0
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
208
0
#if defined(__SSSE3__)
209
0
#ifdef __x86_64__
210
0
  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
211
0
#else
212
0
  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
213
0
#endif
214
0
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
215
0
                            _mm_shuffle_epi8(a, order));
216
0
#else
217
0
  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
218
0
#endif
219
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziplo_16
Unexecuted instantiation: cdef_block_avx2.c:v128_unziplo_16
220
221
0
SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
222
0
  return _mm_castps_si128(_mm_shuffle_ps(
223
0
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
224
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziphi_32
Unexecuted instantiation: cdef_block_avx2.c:v128_unziphi_32
225
226
0
SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
227
0
  return _mm_castps_si128(_mm_shuffle_ps(
228
0
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
229
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unziplo_32
Unexecuted instantiation: cdef_block_avx2.c:v128_unziplo_32
230
231
0
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
232
0
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
233
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpack_u8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpack_u8_s16
234
235
0
SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
236
0
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
237
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpacklo_u8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpacklo_u8_s16
238
239
0
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
240
0
  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
241
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpackhi_u8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpackhi_u8_s16
242
243
0
SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
244
0
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
245
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpack_s8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpack_s8_s16
246
247
0
SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
248
0
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
249
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpacklo_s8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpacklo_s8_s16
250
251
0
SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
252
0
  return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
253
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpackhi_s8_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_unpackhi_s8_s16
254
255
8.18k
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
256
8.18k
  return _mm_packs_epi32(b, a);
257
8.18k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_pack_s32_s16
cdef_block_avx2.c:v128_pack_s32_s16
Line
Count
Source
255
8.18k
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
256
8.18k
  return _mm_packs_epi32(b, a);
257
8.18k
}
258
259
0
SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
260
0
#if defined(__SSE4_1__)
261
0
  return _mm_packus_epi32(b, a);
262
0
#else
263
0
  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
264
0
                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
265
0
#endif
266
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_pack_s32_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_pack_s32_u16
267
268
11.0M
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
269
11.0M
  return _mm_packus_epi16(b, a);
270
11.0M
}
Unexecuted instantiation: cdef_block_sse4.c:v128_pack_s16_u8
cdef_block_avx2.c:v128_pack_s16_u8
Line
Count
Source
268
11.0M
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
269
11.0M
  return _mm_packus_epi16(b, a);
270
11.0M
}
271
272
8.18k
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
273
8.18k
  return _mm_packs_epi16(b, a);
274
8.18k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_pack_s16_s8
cdef_block_avx2.c:v128_pack_s16_s8
Line
Count
Source
272
8.18k
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
273
8.18k
  return _mm_packs_epi16(b, a);
274
8.18k
}
275
276
0
SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
277
0
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
278
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpack_u16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpack_u16_s32
279
280
0
SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
281
0
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
282
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpack_s16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpack_s16_s32
283
284
0
SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
285
0
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
286
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpacklo_u16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpacklo_u16_s32
287
288
0
SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
289
0
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
290
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpacklo_s16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpacklo_s16_s32
291
292
0
SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
293
0
  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
294
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpackhi_u16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpackhi_u16_s32
295
296
0
SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
297
0
  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
298
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_unpackhi_s16_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_unpackhi_s16_s32
299
300
49.0k
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
301
49.0k
#if defined(__SSSE3__)
302
49.0k
  return _mm_shuffle_epi8(x, pattern);
303
#else
304
  v128 output;
305
  unsigned char *input = (unsigned char *)&x;
306
  unsigned char *index = (unsigned char *)&pattern;
307
  unsigned char *selected = (unsigned char *)&output;
308
  int counter;
309
310
  for (counter = 0; counter < 16; counter++) {
311
    selected[counter] = input[index[counter] & 15];
312
  }
313
314
  return output;
315
#endif
316
49.0k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shuffle_8
cdef_block_avx2.c:v128_shuffle_8
Line
Count
Source
300
49.0k
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
301
49.0k
#if defined(__SSSE3__)
302
49.0k
  return _mm_shuffle_epi8(x, pattern);
303
#else
304
  v128 output;
305
  unsigned char *input = (unsigned char *)&x;
306
  unsigned char *index = (unsigned char *)&pattern;
307
  unsigned char *selected = (unsigned char *)&output;
308
  int counter;
309
310
  for (counter = 0; counter < 16; counter++) {
311
    selected[counter] = input[index[counter] & 15];
312
  }
313
314
  return output;
315
#endif
316
49.0k
}
317
318
0
SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
319
0
  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
320
0
  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
321
0
  v128 t = v128_add_32(t1, t2);
322
0
  t = v128_add_32(t, _mm_srli_si128(t, 8));
323
0
  t = v128_add_32(t, _mm_srli_si128(t, 4));
324
0
  return (int32_t)v128_low_u32(t);
325
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_dotp_su8
Unexecuted instantiation: cdef_block_avx2.c:v128_dotp_su8
326
327
0
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
328
0
  v128 r = _mm_madd_epi16(a, b);
329
0
#if defined(__SSE4_1__) && defined(__x86_64__)
330
0
  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
331
0
                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
332
0
  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
333
0
#else
334
0
  return (int64_t)_mm_cvtsi128_si32(r) +
335
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
336
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
337
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
338
0
#endif
339
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_dotp_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_dotp_s16
340
341
0
SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
342
0
  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
343
0
  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
344
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_hadd_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_hadd_u8
345
346
typedef v128 sad128_internal;
347
348
0
SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
349
0
  return _mm_setzero_si128();
350
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u8_init
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u8_init
351
352
/* Implementation dependent return value.  Result must be finalised with
353
   v128_sad_u8_sum().
354
   The result for more than 32 v128_sad_u8() calls is undefined. */
355
0
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
356
0
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
357
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u8
358
359
0
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
360
0
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
361
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u8_sum
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u8_sum
362
363
typedef int32_t ssd128_internal;
364
365
0
SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_u8_init
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_u8_init
366
367
/* Implementation dependent return value.  Result must be finalised with
368
 * v128_ssd_u8_sum(). */
369
0
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
370
0
  v128 z = _mm_setzero_si128();
371
0
  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
372
0
  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
373
0
  v128 rl = _mm_madd_epi16(l, l);
374
0
  v128 rh = _mm_madd_epi16(h, h);
375
0
  v128 r = _mm_add_epi32(rl, rh);
376
0
  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
377
0
  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
378
0
  return s + _mm_cvtsi128_si32(r);
379
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_u8
380
381
0
SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_u8_sum
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_u8_sum
382
383
0
SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_or
Unexecuted instantiation: cdef_block_avx2.c:v128_or
384
385
0
SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_xor
Unexecuted instantiation: cdef_block_avx2.c:v128_xor
386
387
0
SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_and
Unexecuted instantiation: cdef_block_avx2.c:v128_and
388
389
0
SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
Unexecuted instantiation: cdef_block_sse4.c:v128_andn
Unexecuted instantiation: cdef_block_avx2.c:v128_andn
390
391
0
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
392
0
  v64 lo_bits = v64_mullo_s16(a, b);
393
0
  v64 hi_bits = v64_mulhi_s16(a, b);
394
0
  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
395
0
                       v64_ziplo_16(hi_bits, lo_bits));
396
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_mul_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_mul_s16
397
398
0
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
399
0
  return _mm_mullo_epi16(a, b);
400
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_mullo_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_mullo_s16
401
402
0
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
403
0
  return _mm_mulhi_epi16(a, b);
404
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_mulhi_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_mulhi_s16
405
406
114k
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
407
114k
#if defined(__SSE4_1__)
408
114k
  return _mm_mullo_epi32(a, b);
409
#else
410
  return _mm_unpacklo_epi32(
411
      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
412
      _mm_shuffle_epi32(
413
          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
414
#endif
415
114k
}
Unexecuted instantiation: cdef_block_sse4.c:v128_mullo_s32
cdef_block_avx2.c:v128_mullo_s32
Line
Count
Source
406
114k
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
407
114k
#if defined(__SSE4_1__)
408
114k
  return _mm_mullo_epi32(a, b);
409
#else
410
  return _mm_unpacklo_epi32(
411
      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
412
      _mm_shuffle_epi32(
413
          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
414
#endif
415
114k
}
416
417
0
SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
418
0
  v128 r = v128_mullo_s32(a, b);
419
0
  return (int64_t)_mm_cvtsi128_si32(r) +
420
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
421
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
422
0
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
423
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_dotp_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_dotp_s32
424
425
114k
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_madd_s16
cdef_block_avx2.c:v128_madd_s16
Line
Count
Source
425
114k
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
426
427
0
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
428
0
#if defined(__SSSE3__)
429
0
  return _mm_maddubs_epi16(a, b);
430
0
#else
431
0
  return _mm_packs_epi32(
432
0
      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
433
0
                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
434
0
      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
435
0
                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
436
0
#endif
437
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_madd_us8
Unexecuted instantiation: cdef_block_avx2.c:v128_madd_us8
438
439
0
SIMD_INLINE v128 v128_padd_u8(v128 a) {
440
0
  return v128_madd_us8(a, _mm_set1_epi8(1));
441
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_padd_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_padd_u8
442
443
0
SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_avg_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_avg_u8
444
445
0
SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
446
0
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
447
0
                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
448
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_rdavg_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_rdavg_u8
449
450
0
SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
451
0
  return _mm_sub_epi16(_mm_avg_epu16(a, b),
452
0
                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
453
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_rdavg_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_rdavg_u16
454
455
0
SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_avg_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_avg_u16
456
457
0
SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_min_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_min_u8
458
459
0
SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
Unexecuted instantiation: cdef_block_sse4.c:v128_max_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_max_u8
460
461
0
SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
462
0
#if defined(__SSE4_1__)
463
0
  return _mm_min_epi8(a, b);
464
0
#else
465
0
  v128 mask = _mm_cmplt_epi8(a, b);
466
0
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
467
0
#endif
468
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_min_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_min_s8
469
470
8.18k
SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
Unexecuted instantiation: cdef_block_sse4.c:v128_movemask_8
cdef_block_avx2.c:v128_movemask_8
Line
Count
Source
470
8.18k
SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
471
472
0
/* Byte-wise blend of a and b controlled by c.  With SSE4.1 this is
 * _mm_blendv_epi8(a, b, c).  Otherwise a mask is built from the bytes of c
 * that compare below zero as signed values, and the result is
 * v128_and(b, mask) OR-ed with v128_andn(a, mask). */
SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
#if defined(__SSE4_1__)
  return _mm_blendv_epi8(a, b, c);
#else
  const v128 negative = _mm_cmplt_epi8(c, v128_zero());
  const v128 from_b = v128_and(b, negative);
  const v128 from_a = v128_andn(a, negative);
  return v128_or(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v128_blend_8
Unexecuted instantiation: cdef_block_avx2.c:v128_blend_8
480
481
0
/* Per-lane signed 8-bit maximum of a and b. */
SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  /* Without SSE4.1: take a in lanes where b < a, otherwise b. */
  const v128 a_is_greater = _mm_cmplt_epi8(b, a);
  const v128 from_b = _mm_andnot_si128(a_is_greater, b);
  const v128 from_a = _mm_and_si128(a_is_greater, a);
  return _mm_or_si128(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v128_max_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_max_s8
489
490
0
/* Per-lane signed 16-bit minimum of a and b; wraps _mm_min_epi16. */
SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) {
  const v128 smallest = _mm_min_epi16(a, b);
  return smallest;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_min_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_min_s16
491
492
0
/* Per-lane signed 16-bit maximum of a and b; wraps _mm_max_epi16. */
SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) {
  const v128 largest = _mm_max_epi16(a, b);
  return largest;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_max_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_max_s16
493
494
0
/* Per-lane signed 32-bit minimum of a and b. */
SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi32(a, b);
#else
  /* Without SSE4.1: take a in lanes where a < b, otherwise b. */
  const v128 a_is_less = _mm_cmplt_epi32(a, b);
  const v128 from_b = _mm_andnot_si128(a_is_less, b);
  const v128 from_a = _mm_and_si128(a_is_less, a);
  return _mm_or_si128(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v128_min_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_min_s32
502
503
24.5k
/* Per-lane signed 32-bit maximum of a and b. */
SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi32(a, b);
#else
  /* Without SSE4.1: take a in lanes where b < a, otherwise b. */
  const v128 a_is_greater = _mm_cmplt_epi32(b, a);
  const v128 from_b = _mm_andnot_si128(a_is_greater, b);
  const v128 from_a = _mm_and_si128(a_is_greater, a);
  return _mm_or_si128(from_b, from_a);
#endif
}
Unexecuted instantiation: cdef_block_sse4.c:v128_max_s32
cdef_block_avx2.c:v128_max_s32
Line
Count
Source
503
24.5k
SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
504
24.5k
#if defined(__SSE4_1__)
505
24.5k
  return _mm_max_epi32(a, b);
506
#else
507
  v128 mask = _mm_cmplt_epi32(b, a);
508
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
509
#endif
510
24.5k
}
511
512
0
/* Per-lane signed 8-bit "a > b" comparison; wraps _mm_cmpgt_epi8. */
SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) {
  const v128 mask = _mm_cmpgt_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpgt_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_cmpgt_s8
513
514
0
/* Per-lane signed 8-bit "a < b" comparison; wraps _mm_cmplt_epi8. */
SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) {
  const v128 mask = _mm_cmplt_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmplt_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_cmplt_s8
515
516
0
/* Per-lane 8-bit equality comparison; wraps _mm_cmpeq_epi8. */
SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) {
  const v128 mask = _mm_cmpeq_epi8(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpeq_8
Unexecuted instantiation: cdef_block_avx2.c:v128_cmpeq_8
517
518
0
/* Per-lane signed 16-bit "a > b" comparison; wraps _mm_cmpgt_epi16. */
SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
  const v128 mask = _mm_cmpgt_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpgt_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_cmpgt_s16
521
522
0
/* Per-lane signed 16-bit "a < b" comparison; wraps _mm_cmplt_epi16. */
SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
  const v128 mask = _mm_cmplt_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmplt_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_cmplt_s16
525
526
16.3k
/* Per-lane 32-bit equality comparison; wraps _mm_cmpeq_epi32. */
SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) {
  const v128 mask = _mm_cmpeq_epi32(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpeq_32
cdef_block_avx2.c:v128_cmpeq_32
Line
Count
Source
526
16.3k
SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
527
528
0
/* Per-lane signed 32-bit "a > b" comparison; wraps _mm_cmpgt_epi32. */
SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
  const v128 mask = _mm_cmpgt_epi32(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpgt_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_cmpgt_s32
531
532
0
/* Per-lane signed 32-bit "a < b" comparison; wraps _mm_cmplt_epi32. */
SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
  const v128 mask = _mm_cmplt_epi32(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmplt_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_cmplt_s32
535
536
0
/* Per-lane 16-bit equality comparison; wraps _mm_cmpeq_epi16. */
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) {
  const v128 mask = _mm_cmpeq_epi16(a, b);
  return mask;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_cmpeq_16
Unexecuted instantiation: cdef_block_avx2.c:v128_cmpeq_16
537
538
0
/* Shifts every 8-bit lane of a left by c bits.  The shift is carried out on
 * 16-bit lanes, then each byte is masked with (0xff << c) to clear the bits
 * that crossed over from the neighbouring byte. */
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  const __m128i byte_mask = _mm_set1_epi8((char)(0xff << c));
  const __m128i shifted = _mm_sll_epi16(a, count);
  return _mm_and_si128(byte_mask, shifted);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shl_8
Unexecuted instantiation: cdef_block_avx2.c:v128_shl_8
542
543
0
/* Logical right shift of every 8-bit lane of a by c bits.  The shift is
 * carried out on 16-bit lanes, then each byte is masked with (0xff >> c) to
 * clear the bits that crossed over from the neighbouring byte. */
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  const __m128i byte_mask = _mm_set1_epi8((char)(0xff >> c));
  const __m128i shifted = _mm_srl_epi16(a, count);
  return _mm_and_si128(byte_mask, shifted);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_u8
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_u8
547
548
0
/* Arithmetic right shift of every 8-bit lane of a by c bits.  Each byte is
 * duplicated into a 16-bit lane, shifted right arithmetically by c + 8, and
 * the two halves are packed back to bytes with _mm_packs_epi16. */
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)(c + 8));
  const __m128i low_half = _mm_sra_epi16(_mm_unpacklo_epi8(a, a), count);
  const __m128i high_half = _mm_sra_epi16(_mm_unpackhi_epi8(a, a), count);
  return _mm_packs_epi16(low_half, high_half);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_s8
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_s8
553
554
0
/* Shifts every 16-bit lane of a left by c bits using _mm_sll_epi16. */
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sll_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shl_16
Unexecuted instantiation: cdef_block_avx2.c:v128_shl_16
557
558
0
/* Logical right shift of every 16-bit lane of a by c bits using
 * _mm_srl_epi16. */
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_srl_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_u16
561
562
65.4k
/* Arithmetic right shift of every 16-bit lane of a by c bits using
 * _mm_sra_epi16. */
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sra_epi16(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_s16
cdef_block_avx2.c:v128_shr_s16
Line
Count
Source
562
65.4k
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
563
65.4k
  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
564
65.4k
}
565
566
0
/* Shifts every 32-bit lane of a left by c bits using _mm_sll_epi32. */
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sll_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shl_32
Unexecuted instantiation: cdef_block_avx2.c:v128_shl_32
569
570
0
/* Logical right shift of every 32-bit lane of a by c bits using
 * _mm_srl_epi32. */
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_srl_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_u32
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_u32
573
574
0
/* Arithmetic right shift of every 32-bit lane of a by c bits using
 * _mm_sra_epi32. */
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sra_epi32(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_s32
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_s32
577
578
0
/* Shifts every 64-bit lane of a left by c bits using _mm_sll_epi64. */
SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_sll_epi64(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shl_64
Unexecuted instantiation: cdef_block_avx2.c:v128_shl_64
581
582
0
/* Logical right shift of every 64-bit lane of a by c bits using
 * _mm_srl_epi64. */
SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
  const __m128i count = _mm_cvtsi32_si128((int)c);
  return _mm_srl_epi64(a, count);
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_u64
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_u64
585
586
0
/* Right shift of each 64-bit half of a by c bits, performed on the halves as
 * int64_t scalars.  A scalar path is used because _mm_sra_epi64 appeared to
 * be missing in gcc when this was written. */
SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
  const int64_t high = (int64_t)v64_u64(v128_high_v64(a));
  const int64_t low = (int64_t)v64_u64(v128_low_v64(a));
  return v128_from_64((uint64_t)(high >> c), (uint64_t)(low >> c));
}
Unexecuted instantiation: cdef_block_sse4.c:v128_shr_s64
Unexecuted instantiation: cdef_block_avx2.c:v128_shr_s64
592
593
/* The shift counts below are handed to intrinsics that require immediate
   values, so these operations are #defines rather than functions. */
#define v128_shl_n_byte(a, c) _mm_slli_si128((a), (c) & 127)
#define v128_shr_n_byte(a, c) _mm_srli_si128((a), (c) & 127)
#define v128_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16((a), (c)))
#define v128_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16((a), (c)))
/* Note: the argument a is expanded four times here. */
#define v128_shr_n_s8(a, c)                                             \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8((a), (a)), (c) + 8), \
                  _mm_srai_epi16(_mm_unpackhi_epi8((a), (a)), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16((a), (c))
#define v128_shr_n_u16(a, c) _mm_srli_epi16((a), (c))
#define v128_shr_n_s16(a, c) _mm_srai_epi16((a), (c))
#define v128_shl_n_32(a, c) _mm_slli_epi32((a), (c))
#define v128_shr_n_u32(a, c) _mm_srli_epi32((a), (c))
#define v128_shr_n_s32(a, c) _mm_srai_epi32((a), (c))
#define v128_shl_n_64(a, c) _mm_slli_epi64((a), (c))
#define v128_shr_n_u64(a, c) _mm_srli_epi64((a), (c))
/* Falls back to the function form; _mm_srai_epi64 missing in gcc? */
#define v128_shr_n_s64(a, c) v128_shr_s64((a), (c))
614
615
typedef v128 sad128_internal_u16;
616
617
0
SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); }
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u16_init
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u16_init
618
619
/* Implementation dependent return value.  Result must be finalised with
620
 * v128_sad_u16_sum(). */
621
SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
622
0
                                             v128 b) {
623
0
#if defined(__SSE4_1__)
624
0
  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
625
0
#else
626
0
  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
627
0
                          v128_xor(b, v128_dup_16(32768)));
628
0
  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
629
0
                  v128_or(v128_and(a, t), v128_andn(b, t)));
630
0
#endif
631
0
  return v128_add_32(
632
0
      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
633
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u16
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u16
634
635
0
/* Finalises a v128_sad_u16() accumulator by adding together its four 32-bit
 * lanes (read at byte offsets 0, 4, 8 and 12). */
SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
  const uint32_t lane0 = v128_low_u32(s);
  const uint32_t lane1 = v128_low_u32(v128_shr_n_byte(s, 4));
  const uint32_t lane2 = v128_low_u32(v128_shr_n_byte(s, 8));
  const uint32_t lane3 = v128_low_u32(v128_shr_n_byte(s, 12));
  return lane0 + lane1 + lane2 + lane3;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_sad_u16_sum
Unexecuted instantiation: cdef_block_avx2.c:v128_sad_u16_sum
640
641
typedef v128 ssd128_internal_s16;
642
643
0
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_s16_init
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_s16_init
644
645
/* Implementation dependent return value.  Result must be finalised with
646
 * v128_ssd_s16_sum(). */
647
SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
648
0
                                             v128 b) {
649
0
  v128 d = v128_sub_16(a, b);
650
0
  d = v128_madd_s16(d, d);
651
0
  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
652
0
                                    _mm_unpacklo_epi32(d, v128_zero())));
653
0
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_s16
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_s16
654
655
0
/* Finalises a v128_ssd_s16() accumulator by adding its low and high 64-bit
 * halves. */
SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
  const uint64_t low = v64_u64(v128_low_v64(s));
  const uint64_t high = v64_u64(v128_high_v64(s));
  return low + high;
}
Unexecuted instantiation: cdef_block_sse4.c:v128_ssd_s16_sum
Unexecuted instantiation: cdef_block_avx2.c:v128_ssd_s16_sum
658
659
#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_