/src/aom/aom_dsp/simd/v128_intrinsics_x86.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ |
13 | | #define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ |
14 | | |
15 | | #include <stdint.h> |
16 | | #include "aom_dsp/simd/v64_intrinsics_x86.h" |
17 | | |
18 | | typedef __m128i v128; |
19 | | |
20 | 8.18k | SIMD_INLINE uint32_t v128_low_u32(v128 a) { |
21 | 8.18k | return (uint32_t)_mm_cvtsi128_si32(a); |
22 | 8.18k | } |
23 | | |
24 | 109M | SIMD_INLINE v64 v128_low_v64(v128 a) { |
25 | 109M | return _mm_unpacklo_epi64(a, v64_zero()); |
26 | 109M | } |
27 | | |
28 | 106M | SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } |
29 | | |
30 | 401M | SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { |
31 | 401M | return _mm_unpacklo_epi64(b, a); |
32 | 401M | } |
33 | | |
34 | 0 | SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { |
35 | 0 | return v128_from_v64(v64_from_64(a), v64_from_64(b)); |
36 | 0 | } |
37 | | |
38 | 147k | SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { |
39 | 147k | return _mm_set_epi32((int)a, (int)b, (int)c, (int)d); |
40 | 147k | } |
41 | | |
42 | 150M | SIMD_INLINE v128 v128_load_aligned(const void *p) { |
43 | 150M | return _mm_load_si128((__m128i *)p); |
44 | 150M | } |
45 | | |
46 | 724M | SIMD_INLINE v128 v128_load_unaligned(const void *p) { |
47 | 724M | #if defined(__SSSE3__) |
48 | 724M | return _mm_lddqu_si128((__m128i *)p); |
49 | | #else |
50 | | return _mm_loadu_si128((__m128i *)p); |
51 | | #endif |
52 | 724M | } |
53 | | |
54 | 0 | SIMD_INLINE void v128_store_aligned(void *p, v128 a) { |
55 | 0 | _mm_store_si128((__m128i *)p, a); |
56 | 0 | } |
57 | | |
58 | 151M | SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { |
59 | 151M | _mm_storeu_si128((__m128i *)p, a); |
60 | 151M | } |
61 | | |
62 | | // The following function requires an immediate. |
63 | | // Some compilers will check this during optimisation, others won't. |
64 | | #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) |
65 | | #if defined(__SSSE3__) |
66 | | SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) { |
67 | | return c ? _mm_alignr_epi8(a, b, c) : b; |
68 | | } |
69 | | #else |
70 | | #define v128_align(a, b, c) \ |
71 | | ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) |
72 | | #endif |
73 | | #else |
74 | | #if defined(__SSSE3__) |
75 | 16.3k | #define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b)) |
76 | | #else |
77 | | #define v128_align(a, b, c) \ |
78 | | ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) |
79 | | #endif |
80 | | #endif |
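As the comment above notes, the byte offset passed to v128_align() must be a compile-time constant: every expansion above ends up in _mm_alignr_epi8() or _mm_srli_si128()/_mm_slli_si128(), which take immediates. A minimal usage sketch, assuming this header is included; the helper name is hypothetical:

    // Hypothetical helper: extract 16 bytes starting 4 bytes into the
    // concatenation lo:hi, i.e. the upper 12 bytes of lo followed by the
    // lower 4 bytes of hi. The offset must be a literal constant.
    static v128 shift_window_by_4(v128 hi, v128 lo) {
      return v128_align(hi, lo, 4);
    }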
81 | | |
82 | 0 | SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); } |
83 | | |
84 | 0 | SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } |
85 | | |
86 | 65.4k | SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); } |
87 | | |
88 | 16.3k | SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); } |
89 | | |
90 | 0 | SIMD_INLINE v128 v128_dup_64(uint64_t x) { |
91 | 0 | // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers |
92 | 0 | return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32), |
93 | 0 | (int32_t)x); |
94 | 0 | } |
95 | | |
96 | 0 | SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } |
97 | |
98 | 523k | SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } |
99 | |
100 | 0 | SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } |
101 | |
102 | 0 | SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } |
103 | |
104 | 0 | SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } |
105 | |
106 | 98.1k | SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } |
107 | |
108 | 0 | SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } |
109 | | |
110 | 0 | SIMD_INLINE v128 v128_padd_s16(v128 a) { |
111 | 0 | return _mm_madd_epi16(a, _mm_set1_epi16(1)); |
112 | 0 | } |
113 | | |
114 | 0 | SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } |
115 | |
116 | 0 | SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } |
117 | |
118 | 0 | SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } |
119 | |
120 | 65.4k | SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } |
121 | |
122 | 0 | SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } |
123 | |
124 | 0 | SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } |
125 | |
126 | 0 | SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } |
127 | |
128 | 0 | SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } |
129 | | |
130 | 0 | SIMD_INLINE v128 v128_abs_s16(v128 a) { |
131 | 0 | #if defined(__SSSE3__) |
132 | 0 | return _mm_abs_epi16(a); |
133 | | #else |
134 | | return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); |
135 | | #endif |
136 | 0 | } |
137 | | |
138 | 0 | SIMD_INLINE v128 v128_abs_s8(v128 a) { |
139 | 0 | #if defined(__SSSE3__) |
140 | 0 | return _mm_abs_epi8(a); |
141 | 0 | #else |
142 | 0 | v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); |
143 | 0 | return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); |
144 | 0 | #endif |
145 | 0 | } |
146 | | |
147 | 0 | SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { |
148 | 0 | return _mm_unpacklo_epi8(b, a); |
149 | 0 | } |
150 | | |
151 | 0 | SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { |
152 | 0 | return _mm_unpackhi_epi8(b, a); |
153 | 0 | } |
154 | | |
155 | 81.8k | SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { |
156 | 81.8k | return _mm_unpacklo_epi16(b, a); |
157 | 81.8k | } |
158 | | |
159 | 81.8k | SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { |
160 | 81.8k | return _mm_unpackhi_epi16(b, a); |
161 | 81.8k | } |
162 | | |
163 | 65.4k | SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { |
164 | 65.4k | return _mm_unpacklo_epi32(b, a); |
165 | 65.4k | } |
166 | | |
167 | 65.4k | SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { |
168 | 65.4k | return _mm_unpackhi_epi32(b, a); |
169 | 65.4k | } |
170 | | |
171 | 65.4k | SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { |
172 | 65.4k | return _mm_unpacklo_epi64(b, a); |
173 | 65.4k | } |
174 | | |
175 | 65.4k | SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { |
176 | 65.4k | return _mm_unpackhi_epi64(b, a); |
177 | 65.4k | } |
178 | | |
179 | 0 | SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } |
180 | |
181 | 0 | SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } |
182 | |
183 | 0 | SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } |
184 | | |
185 | 0 | SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { |
186 | 0 | return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); |
187 | 0 | } |
188 | | |
189 | 0 | SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { |
190 | 0 | #if defined(__SSSE3__) |
191 | 0 | #ifdef __x86_64__ |
192 | 0 | v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); |
193 | 0 | #else |
194 | 0 | v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); |
195 | 0 | #endif |
196 | 0 | return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), |
197 | 0 | _mm_shuffle_epi8(a, order)); |
198 | 0 | #else |
199 | 0 | return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); |
200 | 0 | #endif |
201 | 0 | } |
202 | | |
203 | 0 | SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { |
204 | 0 | return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); |
205 | 0 | } |
206 | | |
207 | 0 | SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { |
208 | 0 | #if defined(__SSSE3__) |
209 | 0 | #ifdef __x86_64__ |
210 | 0 | v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); |
211 | 0 | #else |
212 | 0 | v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); |
213 | 0 | #endif |
214 | 0 | return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), |
215 | 0 | _mm_shuffle_epi8(a, order)); |
216 | 0 | #else |
217 | 0 | return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); |
218 | 0 | #endif |
219 | 0 | } |
220 | | |
221 | 0 | SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { |
222 | 0 | return _mm_castps_si128(_mm_shuffle_ps( |
223 | 0 | _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); |
224 | 0 | } |
225 | | |
226 | 0 | SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { |
227 | 0 | return _mm_castps_si128(_mm_shuffle_ps( |
228 | 0 | _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); |
229 | 0 | } |
230 | | |
231 | 0 | SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { |
232 | 0 | return _mm_unpacklo_epi8(a, _mm_setzero_si128()); |
233 | 0 | } |
234 | | |
235 | 0 | SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { |
236 | 0 | return _mm_unpacklo_epi8(a, _mm_setzero_si128()); |
237 | 0 | } |
238 | | |
239 | 0 | SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { |
240 | 0 | return _mm_unpackhi_epi8(a, _mm_setzero_si128()); |
241 | 0 | } |
242 | | |
243 | 0 | SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { |
244 | 0 | return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); |
245 | 0 | } |
246 | | |
247 | 0 | SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { |
248 | 0 | return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); |
249 | 0 | } |
250 | | |
251 | 0 | SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { |
252 | 0 | return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); |
253 | 0 | } |
254 | | |
255 | 8.18k | SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { |
256 | 8.18k | return _mm_packs_epi32(b, a); |
257 | 8.18k | } |
258 | | |
259 | 0 | SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { |
260 | 0 | #if defined(__SSE4_1__) |
261 | 0 | return _mm_packus_epi32(b, a); |
262 | 0 | #else |
263 | 0 | return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), |
264 | 0 | v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); |
265 | 0 | #endif |
266 | 0 | } |
267 | | |
268 | 11.0M | SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { |
269 | 11.0M | return _mm_packus_epi16(b, a); |
270 | 11.0M | } |
271 | | |
272 | 8.18k | SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { |
273 | 8.18k | return _mm_packs_epi16(b, a); |
274 | 8.18k | } |
275 | | |
276 | 0 | SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { |
277 | 0 | return _mm_unpacklo_epi16(a, _mm_setzero_si128()); |
278 | 0 | } |
279 | | |
280 | 0 | SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { |
281 | 0 | return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); |
282 | 0 | } |
283 | | |
284 | 0 | SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { |
285 | 0 | return _mm_unpacklo_epi16(a, _mm_setzero_si128()); |
286 | 0 | } |
287 | | |
288 | 0 | SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { |
289 | 0 | return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); |
290 | 0 | } |
291 | | |
292 | 0 | SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { |
293 | 0 | return _mm_unpackhi_epi16(a, _mm_setzero_si128()); |
294 | 0 | } |
295 | | |
296 | 0 | SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { |
297 | 0 | return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); |
298 | 0 | } |
299 | | |
300 | 49.0k | SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { |
301 | 49.0k | #if defined(__SSSE3__) |
302 | 49.0k | return _mm_shuffle_epi8(x, pattern); |
303 | | #else |
304 | | v128 output; |
305 | | unsigned char *input = (unsigned char *)&x; |
306 | | unsigned char *index = (unsigned char *)&pattern; |
307 | | unsigned char *selected = (unsigned char *)&output; |
308 | | int counter; |
309 | | |
310 | | for (counter = 0; counter < 16; counter++) { |
311 | | selected[counter] = input[index[counter] & 15]; |
312 | | } |
313 | | |
314 | | return output; |
315 | | #endif |
316 | 49.0k | } |
317 | | |
318 | 0 | SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { |
319 | 0 | v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); |
320 | 0 | v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); |
321 | 0 | v128 t = v128_add_32(t1, t2); |
322 | 0 | t = v128_add_32(t, _mm_srli_si128(t, 8)); |
323 | 0 | t = v128_add_32(t, _mm_srli_si128(t, 4)); |
324 | 0 | return (int32_t)v128_low_u32(t); |
325 | 0 | } |
326 | | |
327 | 0 | SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { |
328 | 0 | v128 r = _mm_madd_epi16(a, b); |
329 | 0 | #if defined(__SSE4_1__) && defined(__x86_64__) |
330 | 0 | v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), |
331 | 0 | _mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); |
332 | 0 | return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); |
333 | 0 | #else |
334 | 0 | return (int64_t)_mm_cvtsi128_si32(r) + |
335 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + |
336 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + |
337 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); |
338 | 0 | #endif |
339 | 0 | } |
340 | | |
341 | 0 | SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { |
342 | 0 | v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); |
343 | 0 | return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); |
344 | 0 | } |
345 | | |
346 | | typedef v128 sad128_internal; |
347 | | |
348 | 0 | SIMD_INLINE sad128_internal v128_sad_u8_init(void) { |
349 | 0 | return _mm_setzero_si128(); |
350 | 0 | } |
351 | | |
352 | | /* Implementation dependent return value. Result must be finalised with |
353 | | v128_sad_sum(). |
354 | | The result for more than 32 v128_sad_u8() calls is undefined. */ |
355 | 0 | SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { |
356 | 0 | return _mm_add_epi64(s, _mm_sad_epu8(a, b)); |
357 | 0 | } |
358 | | |
359 | 0 | SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { |
360 | 0 | return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); |
361 | 0 | } |
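v128_sad_u8_init(), v128_sad_u8() and v128_sad_u8_sum() form an init / accumulate / finalise triple: the accumulator layout is implementation dependent, so only v128_sad_u8_sum() should interpret it, and no more than 32 accumulations are defined. A hedged sketch of that pattern; the helper and its buffer arguments are hypothetical:

    // SAD over `rows` 16-byte rows (rows assumed <= 32, per the comment above).
    static uint32_t sad_16xh(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride, int rows) {
      sad128_internal s = v128_sad_u8_init();
      for (int i = 0; i < rows; i++) {
        s = v128_sad_u8(s, v128_load_unaligned(src + i * src_stride),
                        v128_load_unaligned(ref + i * ref_stride));
      }
      return v128_sad_u8_sum(s);
    }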
362 | | |
363 | | typedef int32_t ssd128_internal; |
364 | | |
365 | 0 | SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; } |
366 | | |
367 | | /* Implementation dependent return value. Result must be finalised with |
368 | | * v128_ssd_sum(). */ |
369 | 0 | SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { |
370 | 0 | v128 z = _mm_setzero_si128(); |
371 | 0 | v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z)); |
372 | 0 | v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z)); |
373 | 0 | v128 rl = _mm_madd_epi16(l, l); |
374 | 0 | v128 rh = _mm_madd_epi16(h, h); |
375 | 0 | v128 r = _mm_add_epi32(rl, rh); |
376 | 0 | r = _mm_add_epi32(r, _mm_srli_si128(r, 8)); |
377 | 0 | r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); |
378 | 0 | return s + _mm_cvtsi128_si32(r); |
379 | 0 | } |
380 | |
381 | 0 | SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; } |
382 | |
383 | 0 | SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } |
384 | |
385 | 0 | SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } |
386 | |
387 | 0 | SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } |
388 | |
389 | 0 | SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } |
390 | | |
391 | 0 | SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { |
392 | 0 | v64 lo_bits = v64_mullo_s16(a, b); |
393 | 0 | v64 hi_bits = v64_mulhi_s16(a, b); |
394 | 0 | return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), |
395 | 0 | v64_ziplo_16(hi_bits, lo_bits)); |
396 | 0 | } |
397 | | |
398 | 0 | SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { |
399 | 0 | return _mm_mullo_epi16(a, b); |
400 | 0 | } |
401 | | |
402 | 0 | SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { |
403 | 0 | return _mm_mulhi_epi16(a, b); |
404 | 0 | } |
405 | | |
406 | 114k | SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { |
407 | 114k | #if defined(__SSE4_1__) |
408 | 114k | return _mm_mullo_epi32(a, b); |
409 | | #else |
410 | | return _mm_unpacklo_epi32( |
411 | | _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), |
412 | | _mm_shuffle_epi32( |
413 | | _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); |
414 | | #endif |
415 | 114k | } |
416 | | |
417 | 0 | SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { |
418 | 0 | v128 r = v128_mullo_s32(a, b); |
419 | 0 | return (int64_t)_mm_cvtsi128_si32(r) + |
420 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + |
421 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + |
422 | 0 | (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); |
423 | 0 | } |
424 | | |
425 | 114k | SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } |
426 | | |
427 | 0 | SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { |
428 | 0 | #if defined(__SSSE3__) |
429 | 0 | return _mm_maddubs_epi16(a, b); |
430 | 0 | #else |
431 | 0 | return _mm_packs_epi32( |
432 | 0 | _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), |
433 | 0 | _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), |
434 | 0 | _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), |
435 | 0 | _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); |
436 | 0 | #endif |
437 | 0 | } |
438 | | |
439 | 0 | SIMD_INLINE v128 v128_padd_u8(v128 a) { |
440 | 0 | return v128_madd_us8(a, _mm_set1_epi8(1)); |
441 | 0 | } |
442 | | |
443 | 0 | SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } |
444 | | |
445 | 0 | SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { |
446 | 0 | return _mm_sub_epi8(_mm_avg_epu8(a, b), |
447 | 0 | _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); |
448 | 0 | } |
449 | | |
450 | 0 | SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { |
451 | 0 | return _mm_sub_epi16(_mm_avg_epu16(a, b), |
452 | 0 | _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); |
453 | 0 | } |
454 | |
455 | 0 | SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } |
456 | |
457 | 0 | SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } |
458 | |
459 | 0 | SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } |
460 | | |
461 | 0 | SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { |
462 | 0 | #if defined(__SSE4_1__) |
463 | 0 | return _mm_min_epi8(a, b); |
464 | 0 | #else |
465 | 0 | v128 mask = _mm_cmplt_epi8(a, b); |
466 | 0 | return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); |
467 | 0 | #endif |
468 | 0 | } |
469 | | |
470 | 8.18k | SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } |
471 | | |
472 | 0 | SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { |
473 | 0 | #if defined(__SSE4_1__) |
474 | 0 | return _mm_blendv_epi8(a, b, c); |
475 | 0 | #else |
476 | 0 | c = _mm_cmplt_epi8(c, v128_zero()); |
477 | 0 | return v128_or(v128_and(b, c), v128_andn(a, c)); |
478 | 0 | #endif |
479 | 0 | } |
480 | | |
481 | 0 | SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { |
482 | 0 | #if defined(__SSE4_1__) |
483 | 0 | return _mm_max_epi8(a, b); |
484 | 0 | #else |
485 | 0 | v128 mask = _mm_cmplt_epi8(b, a); |
486 | 0 | return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); |
487 | 0 | #endif |
488 | 0 | } |
489 | |
490 | 0 | SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } |
491 | |
492 | 0 | SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } |
493 | | |
494 | 0 | SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { |
495 | 0 | #if defined(__SSE4_1__) |
496 | 0 | return _mm_min_epi32(a, b); |
497 | 0 | #else |
498 | 0 | v128 mask = _mm_cmplt_epi32(a, b); |
499 | 0 | return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); |
500 | 0 | #endif |
501 | 0 | } |
502 | | |
503 | 24.5k | SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { |
504 | 24.5k | #if defined(__SSE4_1__) |
505 | 24.5k | return _mm_max_epi32(a, b); |
506 | | #else |
507 | | v128 mask = _mm_cmplt_epi32(b, a); |
508 | | return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); |
509 | | #endif |
510 | 24.5k | } |
511 | |
512 | 0 | SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } |
513 | |
514 | 0 | SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } |
515 | |
516 | 0 | SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } |
517 | | |
518 | 0 | SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { |
519 | 0 | return _mm_cmpgt_epi16(a, b); |
520 | 0 | } |
521 | | |
522 | 0 | SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { |
523 | 0 | return _mm_cmplt_epi16(a, b); |
524 | 0 | } |
525 | | |
526 | 16.3k | SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } |
527 | | |
528 | 0 | SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { |
529 | 0 | return _mm_cmpgt_epi32(a, b); |
530 | 0 | } |
531 | | |
532 | 0 | SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { |
533 | 0 | return _mm_cmplt_epi32(a, b); |
534 | 0 | } |
535 | |
536 | 0 | SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } |
537 | | |
538 | 0 | SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { |
539 | 0 | return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)), |
540 | 0 | _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c))); |
541 | 0 | } |
542 | | |
543 | 0 | SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { |
544 | 0 | return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), |
545 | 0 | _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c))); |
546 | 0 | } |
547 | | |
548 | 0 | SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { |
549 | 0 | __m128i x = _mm_cvtsi32_si128((int)(c + 8)); |
550 | 0 | return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), |
551 | 0 | _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); |
552 | 0 | } |
553 | | |
554 | 0 | SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { |
555 | 0 | return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)); |
556 | 0 | } |
557 | | |
558 | 0 | SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { |
559 | 0 | return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)); |
560 | 0 | } |
561 | | |
562 | 65.4k | SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { |
563 | 65.4k | return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c)); |
564 | 65.4k | } |
565 | | |
566 | 0 | SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { |
567 | 0 | return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c)); |
568 | 0 | } |
569 | | |
570 | 0 | SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { |
571 | 0 | return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c)); |
572 | 0 | } |
573 | | |
574 | 0 | SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { |
575 | 0 | return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c)); |
576 | 0 | } |
577 | | |
578 | 0 | SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { |
579 | 0 | return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c)); |
580 | 0 | } |
581 | | |
582 | 0 | SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { |
583 | 0 | return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c)); |
584 | 0 | } |
585 | | |
586 | 0 | SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { |
587 | 0 | // _mm_sra_epi64 is missing in gcc? |
588 | 0 | return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c), |
589 | 0 | (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c)); |
590 | 0 | // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c)); |
591 | 0 | } |
592 | | |
593 | | /* These intrinsics require immediate values, so we must use #defines |
594 | | to enforce that. */ |
595 | 245k | #define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) |
596 | 245k | #define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) |
597 | | #define v128_shl_n_8(a, c) \ |
598 | | _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c)) |
599 | | #define v128_shr_n_u8(a, c) \ |
600 | | _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c)) |
601 | | #define v128_shr_n_s8(a, c) \ |
602 | | _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ |
603 | | _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) |
604 | | #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) |
605 | | #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) |
606 | 0 | #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) |
607 | | #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) |
608 | | #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) |
609 | | #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) |
610 | | #define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) |
611 | | #define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) |
612 | | #define v128_shr_n_s64(a, c) \ |
613 | | v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? |
614 | | |
615 | | typedef v128 sad128_internal_u16; |
616 | | |
617 | 0 | SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); } |
618 | | |
619 | | /* Implementation dependent return value. Result must be finalised with |
620 | | * v128_sad_u16_sum(). */ |
621 | | SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, |
622 | 0 | v128 b) { |
623 | 0 | #if defined(__SSE4_1__) |
624 | 0 | v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); |
625 | 0 | #else |
626 | 0 | v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), |
627 | 0 | v128_xor(b, v128_dup_16(32768))); |
628 | 0 | t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), |
629 | 0 | v128_or(v128_and(a, t), v128_andn(b, t))); |
630 | 0 | #endif |
631 | 0 | return v128_add_32( |
632 | 0 | s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); |
633 | 0 | } |
634 | | |
635 | 0 | SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { |
636 | 0 | return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + |
637 | 0 | v128_low_u32(v128_shr_n_byte(s, 8)) + |
638 | 0 | v128_low_u32(v128_shr_n_byte(s, 12)); |
639 | 0 | } |
640 | | |
641 | | typedef v128 ssd128_internal_s16; |
642 | | |
643 | 0 | SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } |
644 | | |
645 | | /* Implementation dependent return value. Result must be finalised with |
646 | | * v128_ssd_s16_sum(). */ |
647 | | SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, |
648 | 0 | v128 b) { |
649 | 0 | v128 d = v128_sub_16(a, b); |
650 | 0 | d = v128_madd_s16(d, d); |
651 | 0 | return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()), |
652 | 0 | _mm_unpacklo_epi32(d, v128_zero()))); |
653 | 0 | } |
654 | | |
655 | 0 | SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { |
656 | 0 | return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); |
657 | 0 | } |
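v128_ssd_s16() follows the same init / accumulate / finalise pattern, accumulating squared 16-bit differences into two 64-bit lanes that v128_ssd_s16_sum() adds together. A hedged sketch; the helper and its arguments are hypothetical:

    // SSD over `n` groups of eight int16_t samples (16 bytes per load).
    static uint64_t ssd_s16_blocks(const int16_t *a, const int16_t *b, int n) {
      ssd128_internal_s16 s = v128_ssd_s16_init();
      for (int i = 0; i < n; i++) {
        s = v128_ssd_s16(s, v128_load_unaligned(a + i * 8),
                         v128_load_unaligned(b + i * 8));
      }
      return v128_ssd_s16_sum(s);
    }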
658 | | |
659 | | #endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ |