/src/aom/aom_dsp/simd/v64_intrinsics_x86.h
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128i v64;
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return _mm_packs_epi32(
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
      _mm_setzero_si128());
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  return _mm_cvtsi64_si128((int64_t)x);
#else
  return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
#endif
}

SIMD_INLINE uint64_t v64_u64(v64 x) {
  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}
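
/* Usage sketch (illustrative helper, not part of the original API): the
   64-bit vector occupies the low half of an __m128i, so a scalar value makes
   a full round trip through v64_from_64() and v64_u64(). */
SIMD_INLINE uint64_t v64_round_trip_example(void) {
  v64 v = v64_from_64(0x0123456789abcdefULL);
  /* v64_low_u32(v) == 0x89abcdefu, v64_high_u32(v) == 0x01234567u */
  return v64_u64(v); /* == 0x0123456789abcdefULL */
}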

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}
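
/* Note: on x86 the aligned and unaligned variants above compile to the same
   code, since movq and the scalar accesses tolerate misalignment here; the
   split exists for targets that enforce alignment. A minimal usage sketch
   (illustrative helper, not part of the original API), copying 8 bytes
   through a v64: */
SIMD_INLINE void v64_copy_8_example(void *dst, const void *src) {
  v64_store_unaligned(dst, v64_load_unaligned(src));
}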

#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c) \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif
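
/* Usage sketch for v64_align (illustrative helper, not part of the original
   API): v64_align(a, b, c) returns bytes c..c+7 of the 16-byte concatenation
   with b in the low half and a in the high half, i.e. a sliding window across
   two adjacent 8-byte loads; c must be a compile-time constant in [0, 7]. */
SIMD_INLINE v64 v64_align_example(void) {
  v64 lo = v64_from_64(0x0706050403020100ULL);
  v64 hi = v64_from_64(0x0f0e0d0c0b0a0908ULL);
  return v64_align(hi, lo, 3); /* == v64_from_64(0x0a09080706050403ULL) */
}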

SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }

SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }

SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }

SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }

SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(t, t);
}

SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
#if defined(__SSE4_1__)
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi32(t, t);
#else
  const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
  const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
  const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
  const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
  return v64_from_16(ah, al, bh, bl);
#endif
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(t, t);
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(t, t);
}

SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0d0b0907050301LL));
#else
  return _mm_packus_epi16(
      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0e0c0a0806040200LL));
#else
  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0e0b0a07060302LL));
#else
  return _mm_packs_epi32(
      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0d0c090805040100LL));
#else
  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return _mm_srli_si128(
      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  unsigned char *selected = (unsigned char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    selected[counter] = input[index[counter]];
  }

  return output;
#endif
}
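
/* Sketch (illustrative helper, not part of the original API): output byte i
   of v64_shuffle_8(x, pattern) is input byte pattern[i], so a descending
   index pattern reverses the byte order of x. */
SIMD_INLINE v64 v64_byte_reverse_example(v64 x) {
  return v64_shuffle_8(x, v64_from_64(0x0001020304050607LL));
}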

SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return (int32_t)v64_low_u32(t);
}

SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  __m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  __m128i x = _mm_cvtepi32_epi64(r);
  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(r);
#endif
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return v64_dotp_s16(a, v64_dup_16(1));
}

typedef v64 sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
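
/* Usage sketch for the init/accumulate/finalise protocol above (illustrative
   helper with assumed pointer/stride parameters, not part of the original
   API): SAD of two 8-pixel-wide blocks of height h, where h <= 32 keeps the
   accumulator within its documented limit. */
SIMD_INLINE uint32_t v64_sad_8xh_example(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         int h) {
  int i;
  sad64_internal s = v64_sad_u8_init();
  for (i = 0; i < h; i++)
    s = v64_sad_u8(s, v64_load_unaligned(a + i * a_stride),
                   v64_load_unaligned(b + i * b_stride));
  return v64_sad_u8_sum(s);
}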

typedef v64 ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
  return _mm_add_epi64(
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
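
/* The same init/accumulate/finalise pattern as v64_sad_u8 applies here:
   replacing v64_sad_u8* with v64_ssd_u8* in the SAD sketch above computes a
   sum of squared differences instead. */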

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }

SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }

SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_mul_epu32(a, b),
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}

SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
  return _mm_packs_epi32(t, t);
#endif
}

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
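
/* _mm_avg_epu8/_mm_avg_epu16 compute (a + b + 1) >> 1 (rounding up);
   subtracting (a ^ b) & 1, the bit lost by the halving, yields the
   rounded-down average (a + b) >> 1 in the two functions below. */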
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}

SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
  return _mm_sub_epi16(_mm_avg_epu16(a, b),
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
}

SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }

SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }

SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return _mm_packs_epi16(
      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
      a);
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
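
/* Usage sketch (illustrative helper, not part of the original API): the
   shift count passed to the _n_ macros must be a compile-time constant so it
   can be encoded as an instruction immediate. */
SIMD_INLINE v64 v64_shift_example(v64 a) {
  return v64_shr_n_u16(a, 2); /* shift each u16 lane right by 2 */
}
/* A run-time count requires the non-immediate forms above, such as
   v64_shr_u16(a, c). */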

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_