/src/aom/aom_dsp/simd/v256_intrinsics_v128.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ |
13 | | #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ |
14 | | |
15 | | #include "config/aom_config.h" |
16 | | |
17 | | #if HAVE_NEON |
18 | | #error "Do not use this file for Neon" |
19 | | #endif |
20 | | |
21 | | #if HAVE_SSE2 |
22 | | #include "aom_dsp/simd/v128_intrinsics_x86.h" |
23 | | #else |
24 | | #include "aom_dsp/simd/v128_intrinsics.h" |
25 | | #endif |
26 | | |
27 | | typedef struct { |
28 | | v128 val[2]; |
29 | | } v256; |
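 | | /* Note (illustrative, not from the upstream header): a v256 is emulated as
 | |    two v128 lanes, with val[0] holding bytes 0..15 (the low half) and val[1]
 | |    holding bytes 16..31 (the high half). Every intrinsic below applies the
 | |    corresponding v128 operation to each lane. A hypothetical caller could
 | |    assemble and split a vector as:
 | |
 | |      v256 v  = v256_from_v128(hi128, lo128);  // hi128/lo128 are assumed v128s
 | |      v128 lo = v256_low_v128(v);              // returns lo128 (val[0])
 | | */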
30 | | |
31 | 0 | SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } |
32 | | |
33 | 0 | SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } |
34 | | |
35 | 0 | SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } |
36 | | |
37 | 0 | SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } |
38 | | |
39 | 0 | SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } |
40 | | |
41 | 0 | SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { |
42 | 0 | v256 t; |
43 | 0 | t.val[1] = hi; |
44 | 0 | t.val[0] = lo; |
45 | 0 | return t; |
46 | 0 | } |
47 | | |
48 | 0 | SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { |
49 | 0 | return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); |
50 | 0 | } |
51 | | |
52 | 0 | SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { |
53 | 0 | return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); |
54 | 0 | } |
55 | | |
56 | 0 | SIMD_INLINE v256 v256_load_unaligned(const void *p) { |
57 | 0 | return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), |
58 | 0 | v128_load_unaligned(p)); |
59 | 0 | } |
60 | | |
61 | 0 | SIMD_INLINE v256 v256_load_aligned(const void *p) { |
62 | 0 | return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), |
63 | 0 | v128_load_aligned(p)); |
64 | 0 | } |
65 | | |
66 | 0 | SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { |
67 | 0 | v128_store_unaligned(p, a.val[0]); |
68 | 0 | v128_store_unaligned((uint8_t *)p + 16, a.val[1]); |
69 | 0 | } |
70 | | |
71 | 0 | SIMD_INLINE void v256_store_aligned(void *p, v256 a) { |
72 | 0 | v128_store_aligned(p, a.val[0]); |
73 | 0 | v128_store_aligned((uint8_t *)p + 16, a.val[1]); |
74 | 0 | } |
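 | | /* Usage sketch (illustrative only; src, dst and n are hypothetical
 | |    caller-side names): the loads and stores above move 32 bytes as two
 | |    16-byte halves, so a simple block copy looks like
 | |
 | |      for (unsigned i = 0; i + 32 <= n; i += 32)
 | |        v256_store_unaligned((uint8_t *)dst + i,
 | |                             v256_load_unaligned((const uint8_t *)src + i));
 | |
 | |    The _aligned variants defer to the aligned v128 loads/stores, so p must
 | |    satisfy the underlying v128 alignment requirement. */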
75 | | |
76 | 0 | SIMD_INLINE v256 v256_zero(void) { |
77 | 0 | return v256_from_v128(v128_zero(), v128_zero()); |
78 | 0 | } |
79 | | |
80 | 0 | SIMD_INLINE v256 v256_dup_8(uint8_t x) { |
81 | 0 | v128 t = v128_dup_8(x); |
82 | 0 | return v256_from_v128(t, t); |
83 | 0 | } |
84 | | |
85 | 0 | SIMD_INLINE v256 v256_dup_16(uint16_t x) { |
86 | 0 | v128 t = v128_dup_16(x); |
87 | 0 | return v256_from_v128(t, t); |
88 | 0 | } |
89 | | |
90 | 0 | SIMD_INLINE v256 v256_dup_32(uint32_t x) { |
91 | 0 | v128 t = v128_dup_32(x); |
92 | 0 | return v256_from_v128(t, t); |
93 | 0 | } |
94 | | |
95 | 0 | SIMD_INLINE v256 v256_dup_64(uint64_t x) { |
96 | 0 | v128 t = v128_dup_64(x); |
97 | 0 | return v256_from_v128(t, t); |
98 | 0 | } |
99 | | |
100 | 0 | SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { |
101 | 0 | return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); |
102 | 0 | } |
103 | | |
104 | 0 | SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { |
105 | 0 | return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); |
106 | 0 | } |
107 | | |
108 | 0 | SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { |
109 | 0 | return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); |
110 | 0 | } |
111 | | |
112 | 0 | SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { |
113 | 0 | return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); |
114 | 0 | } |
115 | | |
116 | | typedef struct { |
117 | | sad128_internal val[2]; |
118 | | } sad256_internal; |
119 | | |
120 | 0 | SIMD_INLINE sad256_internal v256_sad_u8_init(void) { |
121 | 0 | sad256_internal t; |
122 | 0 | t.val[1] = v128_sad_u8_init(); |
123 | 0 | t.val[0] = v128_sad_u8_init(); |
124 | 0 | return t; |
125 | 0 | } |
126 | | |
127 | | /* Implementation dependent return value. Result must be finalised with |
128 | | v256_sad_u8_sum(). |
129 | | The result for more than 16 v256_sad_u8() calls is undefined. */ |
130 | 0 | SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { |
131 | 0 | sad256_internal t; |
132 | 0 | t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); |
133 | 0 | t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); |
134 | 0 | return t; |
135 | 0 | } |
136 | | |
137 | 0 | SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { |
138 | 0 | return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); |
139 | 0 | } |
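 | | /* Usage sketch (illustrative; src, ref, stride and rows are hypothetical):
 | |    the accumulator is opaque, so always go through init/accumulate/sum:
 | |
 | |      sad256_internal acc = v256_sad_u8_init();
 | |      for (int i = 0; i < rows; i++)
 | |        acc = v256_sad_u8(acc, v256_load_unaligned(src + i * stride),
 | |                          v256_load_unaligned(ref + i * stride));
 | |      uint32_t sad = v256_sad_u8_sum(acc);
 | |
 | |    Per the comment above, keep the number of accumulation calls within the
 | |    documented limit before summing. */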
140 | | |
141 | | typedef struct { |
142 | | ssd128_internal val[2]; |
143 | | } ssd256_internal; |
144 | | |
145 | 0 | SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { |
146 | 0 | ssd256_internal t; |
147 | 0 | t.val[1] = v128_ssd_u8_init(); |
148 | 0 | t.val[0] = v128_ssd_u8_init(); |
149 | 0 | return t; |
150 | 0 | } |
151 | | |
152 | | /* Implementation dependent return value. Result must be finalised with |
153 | | * v256_ssd_u8_sum(). */ |
154 | 0 | SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { |
155 | 0 | ssd256_internal t; |
156 | 0 | t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); |
157 | 0 | t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); |
158 | 0 | return t; |
159 | 0 | } |
160 | | |
161 | 0 | SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { |
162 | 0 | return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); |
163 | 0 | } |
164 | | |
165 | 0 | SIMD_INLINE v256 v256_or(v256 a, v256 b) { |
166 | 0 | return v256_from_v128(v128_or(a.val[1], b.val[1]), |
167 | 0 | v128_or(a.val[0], b.val[0])); |
168 | 0 | } |
169 | | |
170 | 0 | SIMD_INLINE v256 v256_xor(v256 a, v256 b) { |
171 | 0 | return v256_from_v128(v128_xor(a.val[1], b.val[1]), |
172 | 0 | v128_xor(a.val[0], b.val[0])); |
173 | 0 | } |
174 | | |
175 | 0 | SIMD_INLINE v256 v256_and(v256 a, v256 b) { |
176 | 0 | return v256_from_v128(v128_and(a.val[1], b.val[1]), |
177 | 0 | v128_and(a.val[0], b.val[0])); |
178 | 0 | } |
179 | | |
180 | 0 | SIMD_INLINE v256 v256_andn(v256 a, v256 b) { |
181 | 0 | return v256_from_v128(v128_andn(a.val[1], b.val[1]), |
182 | 0 | v128_andn(a.val[0], b.val[0])); |
183 | 0 | } |
184 | | |
185 | 0 | SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { |
186 | 0 | return v256_from_v128(v128_add_8(a.val[1], b.val[1]), |
187 | 0 | v128_add_8(a.val[0], b.val[0])); |
188 | 0 | } |
189 | | |
190 | 0 | SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { |
191 | 0 | return v256_from_v128(v128_add_16(a.val[1], b.val[1]), |
192 | 0 | v128_add_16(a.val[0], b.val[0])); |
193 | 0 | } |
194 | | |
195 | 0 | SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { |
196 | 0 | return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), |
197 | 0 | v128_sadd_s8(a.val[0], b.val[0])); |
198 | 0 | } |
199 | | |
200 | 0 | SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { |
201 | 0 | return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), |
202 | 0 | v128_sadd_u8(a.val[0], b.val[0])); |
203 | 0 | } |
204 | | |
205 | 0 | SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { |
206 | 0 | return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), |
207 | 0 | v128_sadd_s16(a.val[0], b.val[0])); |
208 | 0 | } |
209 | | |
210 | 0 | SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { |
211 | 0 | return v256_from_v128(v128_add_32(a.val[1], b.val[1]), |
212 | 0 | v128_add_32(a.val[0], b.val[0])); |
213 | 0 | } |
214 | | |
215 | 0 | SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { |
216 | 0 | return v256_from_v128(v128_add_64(a.val[1], b.val[1]), |
217 | 0 | v128_add_64(a.val[0], b.val[0])); |
218 | 0 | } |
219 | | |
220 | 0 | SIMD_INLINE v256 v256_padd_u8(v256 a) { |
221 | 0 | return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); |
222 | 0 | } |
223 | | |
224 | 0 | SIMD_INLINE v256 v256_padd_s16(v256 a) { |
225 | 0 | return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); |
226 | 0 | } |
227 | | |
228 | 0 | SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { |
229 | 0 | return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), |
230 | 0 | v128_sub_8(a.val[0], b.val[0])); |
231 | 0 | } |
232 | | |
233 | 0 | SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { |
234 | 0 | return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), |
235 | 0 | v128_ssub_u8(a.val[0], b.val[0])); |
236 | 0 | } |
237 | | |
238 | 0 | SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { |
239 | 0 | return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), |
240 | 0 | v128_ssub_s8(a.val[0], b.val[0])); |
241 | 0 | } |
242 | | |
243 | 0 | SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { |
244 | 0 | return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), |
245 | 0 | v128_sub_16(a.val[0], b.val[0])); |
246 | 0 | } |
247 | | |
248 | 0 | SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { |
249 | 0 | return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), |
250 | 0 | v128_ssub_s16(a.val[0], b.val[0])); |
251 | 0 | } |
252 | | |
253 | 0 | SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { |
254 | 0 | return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), |
255 | 0 | v128_ssub_u16(a.val[0], b.val[0])); |
256 | 0 | } |
257 | | |
258 | 0 | SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { |
259 | 0 | return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), |
260 | 0 | v128_sub_32(a.val[0], b.val[0])); |
261 | 0 | } |
262 | | |
263 | 0 | SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { |
264 | 0 | return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), |
265 | 0 | v128_sub_64(a.val[0], b.val[0])); |
266 | 0 | } |
267 | | |
268 | 0 | SIMD_INLINE v256 v256_abs_s16(v256 a) { |
269 | 0 | return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); |
270 | 0 | } |
271 | | |
272 | 0 | SIMD_INLINE v256 v256_abs_s8(v256 a) { |
273 | 0 | return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); |
274 | 0 | } |
275 | | |
276 | 0 | SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { |
277 | 0 | v128 lo_bits = v128_mullo_s16(a, b); |
278 | 0 | v128 hi_bits = v128_mulhi_s16(a, b); |
279 | 0 | return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), |
280 | 0 | v128_ziplo_16(hi_bits, lo_bits)); |
281 | 0 | } |
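 | | /* Note (illustrative, not from the upstream header): v256_mul_s16 is a
 | |    widening multiply. mullo/mulhi give the low and high 16 bits of each
 | |    16x16-bit product, and the 16-bit zips interleave them so that every
 | |    32-bit lane of the result holds one full product. For example, with
 | |    hypothetical inputs whose lane 0 values are a0 = -3 and b0 = 20000,
 | |    lane 0 of the result is the int32 value -60000. */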
282 | | |
283 | 0 | SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { |
284 | 0 | return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), |
285 | 0 | v128_mullo_s16(a.val[0], b.val[0])); |
286 | 0 | } |
287 | | |
288 | 0 | SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { |
289 | 0 | return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), |
290 | 0 | v128_mulhi_s16(a.val[0], b.val[0])); |
291 | 0 | } |
292 | | |
293 | 0 | SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { |
294 | 0 | return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), |
295 | 0 | v128_mullo_s32(a.val[0], b.val[0])); |
296 | 0 | } |
297 | | |
298 | 0 | SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { |
299 | 0 | return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), |
300 | 0 | v128_madd_s16(a.val[0], b.val[0])); |
301 | 0 | } |
302 | | |
303 | 0 | SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { |
304 | 0 | return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), |
305 | 0 | v128_madd_us8(a.val[0], b.val[0])); |
306 | 0 | } |
307 | | |
308 | 0 | SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { |
309 | 0 | return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), |
310 | 0 | v128_avg_u8(a.val[0], b.val[0])); |
311 | 0 | } |
312 | | |
313 | 0 | SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { |
314 | 0 | return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), |
315 | 0 | v128_rdavg_u8(a.val[0], b.val[0])); |
316 | 0 | } |
317 | | |
318 | 0 | SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { |
319 | 0 | return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), |
320 | 0 | v128_rdavg_u16(a.val[0], b.val[0])); |
321 | 0 | } |
322 | | |
323 | 0 | SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { |
324 | 0 | return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), |
325 | 0 | v128_avg_u16(a.val[0], b.val[0])); |
326 | 0 | } |
327 | | |
328 | 0 | SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { |
329 | 0 | return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), |
330 | 0 | v128_min_u8(a.val[0], b.val[0])); |
331 | 0 | } |
332 | | |
333 | 0 | SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { |
334 | 0 | return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), |
335 | 0 | v128_max_u8(a.val[0], b.val[0])); |
336 | 0 | } |
337 | | |
338 | 0 | SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { |
339 | 0 | return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), |
340 | 0 | v128_min_s8(a.val[0], b.val[0])); |
341 | 0 | } |
342 | | |
343 | 0 | SIMD_INLINE uint32_t v256_movemask_8(v256 a) { |
344 | 0 | return (v128_movemask_8(v256_high_v128(a)) << 16) | |
345 | 0 | v128_movemask_8(v256_low_v128(a)); |
346 | 0 | } |
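 | | /* Note (illustrative): v256_movemask_8 packs the most significant bit of
 | |    each of the 32 bytes into a 32-bit mask, low lane in bits 0..15 and high
 | |    lane in bits 16..31; e.g. a hypothetical vector whose bytes are all 0x80
 | |    yields 0xffffffff. */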
347 | | |
348 | 0 | SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { |
349 | 0 | return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), |
350 | 0 | v128_blend_8(a.val[0], b.val[0], c.val[0])); |
351 | 0 | } |
352 | | |
353 | 0 | SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { |
354 | 0 | return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), |
355 | 0 | v128_max_s8(a.val[0], b.val[0])); |
356 | 0 | } |
357 | | |
358 | 0 | SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { |
359 | 0 | return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), |
360 | 0 | v128_min_s16(a.val[0], b.val[0])); |
361 | 0 | } |
362 | | |
363 | 0 | SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { |
364 | 0 | return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), |
365 | 0 | v128_max_s16(a.val[0], b.val[0])); |
366 | 0 | } |
367 | | |
368 | 0 | SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { |
369 | 0 | return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), |
370 | 0 | v128_min_s32(a.val[0], b.val[0])); |
371 | 0 | } |
372 | | |
373 | 0 | SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { |
374 | 0 | return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), |
375 | 0 | v128_max_s32(a.val[0], b.val[0])); |
376 | 0 | } |
377 | | |
378 | 0 | SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { |
379 | 0 | return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), |
380 | 0 | v128_ziplo_8(a.val[0], b.val[0])); |
381 | 0 | } |
382 | | |
383 | 0 | SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { |
384 | 0 | return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), |
385 | 0 | v128_ziplo_8(a.val[1], b.val[1])); |
386 | 0 | } |
387 | | |
388 | 0 | SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { |
389 | 0 | return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), |
390 | 0 | v128_ziplo_16(a.val[0], b.val[0])); |
391 | 0 | } |
392 | | |
393 | 0 | SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { |
394 | 0 | return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), |
395 | 0 | v128_ziplo_16(a.val[1], b.val[1])); |
396 | 0 | } |
397 | | |
398 | 0 | SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { |
399 | 0 | return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), |
400 | 0 | v128_ziplo_32(a.val[0], b.val[0])); |
401 | 0 | } |
402 | | |
403 | 0 | SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { |
404 | 0 | return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), |
405 | 0 | v128_ziplo_32(a.val[1], b.val[1])); |
406 | 0 | } |
407 | | |
408 | 0 | SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { |
409 | 0 | return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), |
410 | 0 | v128_ziplo_64(a.val[0], b.val[0])); |
411 | 0 | } |
412 | | |
413 | 0 | SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { |
414 | 0 | return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), |
415 | 0 | v128_ziplo_64(a.val[1], b.val[1])); |
416 | 0 | } |
417 | | |
418 | 0 | SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { |
419 | 0 | return v256_from_v128(a.val[0], b.val[0]); |
420 | 0 | } |
421 | | |
422 | 0 | SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { |
423 | 0 | return v256_from_v128(a.val[1], b.val[1]); |
424 | 0 | } |
425 | | |
426 | 0 | SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { |
427 | 0 | return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); |
428 | 0 | } |
429 | | |
430 | 0 | SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { |
431 | 0 | return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); |
432 | 0 | } |
433 | | |
434 | 0 | SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { |
435 | 0 | return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); |
436 | 0 | } |
437 | | |
438 | 0 | SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { |
439 | 0 | return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), |
440 | 0 | v128_unziplo_8(b.val[1], b.val[0])); |
441 | 0 | } |
442 | | |
443 | 0 | SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { |
444 | 0 | return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), |
445 | 0 | v128_unziphi_8(b.val[1], b.val[0])); |
446 | 0 | } |
447 | | |
448 | 0 | SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { |
449 | 0 | return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), |
450 | 0 | v128_unziplo_16(b.val[1], b.val[0])); |
451 | 0 | } |
452 | | |
453 | 0 | SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { |
454 | 0 | return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), |
455 | 0 | v128_unziphi_16(b.val[1], b.val[0])); |
456 | 0 | } |
457 | | |
458 | 0 | SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { |
459 | 0 | return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), |
460 | 0 | v128_unziplo_32(b.val[1], b.val[0])); |
461 | 0 | } |
462 | | |
463 | 0 | SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { |
464 | 0 | return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), |
465 | 0 | v128_unziphi_32(b.val[1], b.val[0])); |
466 | 0 | } |
467 | | |
468 | 0 | SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { |
469 | 0 | #if HAVE_SSE2 |
470 | 0 | return v256_from_v128( |
471 | 0 | _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), |
472 | 0 | _mm_castsi128_pd(a.val[1]), 0)), |
473 | 0 | _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), |
474 | 0 | _mm_castsi128_pd(b.val[1]), 0))); |
475 | 0 | #else |
476 | 0 | return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), |
477 | 0 | v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); |
478 | 0 | #endif |
479 | 0 | } |
480 | | |
481 | 0 | SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { |
482 | 0 | #if HAVE_SSE2 |
483 | 0 | return v256_from_v128( |
484 | 0 | _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), |
485 | 0 | _mm_castsi128_pd(a.val[1]), 3)), |
486 | 0 | _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), |
487 | 0 | _mm_castsi128_pd(b.val[1]), 3))); |
488 | 0 | #else |
489 | 0 | return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), |
490 | 0 | v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); |
491 | 0 | #endif |
492 | 0 | } |
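 | | /* Note (illustrative): in the SSE2 paths above, _mm_shuffle_pd with
 | |    immediate 0 gathers the even-indexed 64-bit lanes of a 256-bit value
 | |    (element 0 of each half) and immediate 3 gathers the odd-indexed lanes,
 | |    matching the de-interleave the generic v256_from_v64 fallback spells
 | |    out. */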
493 | | |
494 | 0 | SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { |
495 | 0 | return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); |
496 | 0 | } |
497 | | |
498 | 0 | SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { |
499 | 0 | return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), |
500 | 0 | v128_unpacklo_u8_s16(a.val[0])); |
501 | 0 | } |
502 | | |
503 | 0 | SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { |
504 | 0 | return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), |
505 | 0 | v128_unpacklo_u8_s16(a.val[1])); |
506 | 0 | } |
507 | | |
508 | 0 | SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { |
509 | 0 | return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); |
510 | 0 | } |
511 | | |
512 | 0 | SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { |
513 | 0 | return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), |
514 | 0 | v128_unpacklo_s8_s16(a.val[0])); |
515 | 0 | } |
516 | | |
517 | 0 | SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { |
518 | 0 | return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), |
519 | 0 | v128_unpacklo_s8_s16(a.val[1])); |
520 | 0 | } |
521 | | |
522 | 0 | SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { |
523 | 0 | return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), |
524 | 0 | v128_pack_s32_s16(b.val[1], b.val[0])); |
525 | 0 | } |
526 | | |
527 | 0 | SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { |
528 | 0 | return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), |
529 | 0 | v128_pack_s32_u16(b.val[1], b.val[0])); |
530 | 0 | } |
531 | | |
532 | 0 | SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { |
533 | 0 | return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), |
534 | 0 | v128_pack_s16_u8(b.val[1], b.val[0])); |
535 | 0 | } |
536 | | |
537 | 0 | SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { |
538 | 0 | return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), |
539 | 0 | v128_pack_s16_s8(b.val[1], b.val[0])); |
540 | 0 | } |
541 | | |
542 | 0 | SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { |
543 | 0 | return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); |
544 | 0 | } |
545 | | |
546 | 0 | SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { |
547 | 0 | return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); |
548 | 0 | } |
549 | | |
550 | 0 | SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { |
551 | 0 | return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), |
552 | 0 | v128_unpacklo_u16_s32(a.val[0])); |
553 | 0 | } |
554 | | |
555 | 0 | SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { |
556 | 0 | return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), |
557 | 0 | v128_unpacklo_s16_s32(a.val[0])); |
558 | 0 | } |
559 | | |
560 | 0 | SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { |
561 | 0 | return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), |
562 | 0 | v128_unpacklo_u16_s32(a.val[1])); |
563 | 0 | } |
564 | | |
565 | 0 | SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { |
566 | 0 | return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), |
567 | 0 | v128_unpacklo_s16_s32(a.val[1])); |
568 | 0 | } |
569 | | |
570 | 0 | SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { |
571 | 0 | return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), |
572 | 0 | v128_cmpgt_s8(a.val[0], b.val[0])); |
573 | 0 | } |
574 | | |
575 | 0 | SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { |
576 | 0 | return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), |
577 | 0 | v128_cmplt_s8(a.val[0], b.val[0])); |
578 | 0 | } |
579 | | |
580 | 0 | SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { |
581 | 0 | return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), |
582 | 0 | v128_cmpeq_8(a.val[0], b.val[0])); |
583 | 0 | } |
584 | | |
585 | 0 | SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { |
586 | 0 | return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), |
587 | 0 | v128_cmpgt_s16(a.val[0], b.val[0])); |
588 | 0 | } |
589 | | |
590 | 0 | SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { |
591 | 0 | return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), |
592 | 0 | v128_cmplt_s16(a.val[0], b.val[0])); |
593 | 0 | } |
594 | | |
595 | 0 | SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { |
596 | 0 | return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), |
597 | 0 | v128_cmpeq_16(a.val[0], b.val[0])); |
598 | 0 | } |
599 | | |
600 | 0 | SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { |
601 | 0 | return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), |
602 | 0 | v128_cmpgt_s32(a.val[0], b.val[0])); |
603 | 0 | } |
604 | | |
605 | 0 | SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { |
606 | 0 | return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), |
607 | 0 | v128_cmplt_s32(a.val[0], b.val[0])); |
608 | 0 | } |
609 | | |
610 | 0 | SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { |
611 | 0 | return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), |
612 | 0 | v128_cmpeq_32(a.val[0], b.val[0])); |
613 | 0 | } |
614 | | |
615 | 0 | SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { |
616 | 0 | v128 c16 = v128_dup_8(16); |
617 | 0 | v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); |
618 | 0 | v128 masklo = v128_cmplt_s8(pattern.val[0], c16); |
619 | 0 | return v256_from_v128( |
620 | 0 | v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), |
621 | 0 | v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), |
622 | 0 | v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), |
623 | 0 | v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); |
624 | 0 | } |
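 | | /* Note (illustrative): v256_shuffle_8 emulates a full 32-byte table lookup.
 | |    Each pattern byte in 0..31 indexes into x; indices below 16 are served by
 | |    a shuffle of the low lane, indices 16..31 by a shuffle of the high lane
 | |    (after subtracting 16), and the two candidates are merged with a byte
 | |    blend. E.g. a hypothetical pattern of all 31s broadcasts byte 31 of x. */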
625 | | |
626 | 0 | SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { |
627 | 0 | v128 c16 = v128_dup_8(16); |
628 | 0 | v128 c32 = v128_dup_8(32); |
629 | 0 | v128 c48 = v128_dup_8(48); |
630 | 0 | v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); |
631 | 0 | v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); |
632 | 0 | v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); |
633 | 0 | v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); |
634 | 0 | v256 r1 = v256_from_v128( |
635 | 0 | v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), |
636 | 0 | v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), |
637 | 0 | maskhi48), |
638 | 0 | v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), |
639 | 0 | v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), |
640 | 0 | masklo48)); |
641 | 0 | v256 r2 = v256_from_v128( |
642 | 0 | v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), |
643 | 0 | v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), |
644 | 0 | v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), |
645 | 0 | v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); |
646 | 0 | return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); |
647 | 0 | } |
648 | | |
649 | 0 | SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { |
650 | 0 | return v256_from_v128( |
651 | 0 | v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), |
652 | 0 | v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); |
653 | 0 | } |
654 | | |
655 | 0 | SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { |
656 | 0 | return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); |
657 | 0 | } |
658 | | |
659 | 0 | SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { |
660 | 0 | return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); |
661 | 0 | } |
662 | | |
663 | 0 | SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { |
664 | 0 | return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); |
665 | 0 | } |
666 | | |
667 | 0 | SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { |
668 | 0 | return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); |
669 | 0 | } |
670 | | |
671 | 0 | SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { |
672 | 0 | return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); |
673 | 0 | } |
674 | | |
675 | 0 | SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { |
676 | 0 | return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); |
677 | 0 | } |
678 | | |
679 | 0 | SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { |
680 | 0 | return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); |
681 | 0 | } |
682 | | |
683 | 0 | SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { |
684 | 0 | return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); |
685 | 0 | } |
686 | | |
687 | 0 | SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { |
688 | 0 | return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); |
689 | 0 | } |
690 | | |
691 | 0 | SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { |
692 | 0 | return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); |
693 | 0 | } |
694 | | |
695 | 0 | SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { |
696 | 0 | return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); |
697 | 0 | } |
698 | | |
699 | 0 | SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { |
700 | 0 | return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); |
701 | 0 | } |
702 | | |
703 | | /* These intrinsics require immediate values, so we must use #defines |
704 | | to enforce that. */ |
705 | | #define v256_shl_n_byte(a, n) \ |
706 | | ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ |
707 | | v128_shr_n_byte(a.val[0], 16 - (n))), \ |
708 | | v128_shl_n_byte(a.val[0], (n))) \ |
709 | | : v256_from_v128( \ |
710 | | (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ |
711 | | v128_zero())) |
712 | | |
713 | | #define v256_shr_n_byte(a, n) \ |
714 | | (n == 0 \ |
715 | | ? a \ |
716 | | : ((n) < 16 \ |
717 | | ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ |
718 | | v128_or(v128_shr_n_byte(a.val[0], n), \ |
719 | | v128_shl_n_byte(a.val[1], 16 - (n)))) \ |
720 | | : v256_from_v128( \ |
721 | | v128_zero(), \ |
722 | | (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))) |
723 | | |
724 | | #define v256_align(a, b, c) \ |
725 | | ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) |
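 | | /* Usage sketch (illustrative): because these are macros, the shift or
 | |    alignment amount must be a compile-time constant, e.g.
 | |
 | |      v256 t = v256_shr_n_byte(v, 4);   // v shifted right by 4 bytes
 | |      v256 w = v256_align(hi, lo, 12);  // bytes 12..43 of {lo, hi}, lo = low half
 | |
 | |    where v, hi and lo are hypothetical v256 values; a runtime shift amount
 | |    would not expand to a valid immediate. */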
726 | | |
727 | | #define v256_shl_n_8(a, n) \ |
728 | | v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) |
729 | | #define v256_shl_n_16(a, n) \ |
730 | | v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) |
731 | | #define v256_shl_n_32(a, n) \ |
732 | | v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) |
733 | | #define v256_shl_n_64(a, n) \ |
734 | | v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) |
735 | | #define v256_shr_n_u8(a, n) \ |
736 | | v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) |
737 | | #define v256_shr_n_u16(a, n) \ |
738 | | v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) |
739 | | #define v256_shr_n_u32(a, n) \ |
740 | | v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) |
741 | | #define v256_shr_n_u64(a, n) \ |
742 | | v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) |
743 | | #define v256_shr_n_s8(a, n) \ |
744 | | v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) |
745 | | #define v256_shr_n_s16(a, n) \ |
746 | 0 | v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) |
747 | | #define v256_shr_n_s32(a, n) \ |
748 | | v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) |
749 | | #define v256_shr_n_s64(a, n) \ |
750 | | v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) |
751 | | |
752 | | #define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) |
753 | | #define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) |
754 | | |
755 | | typedef struct { |
756 | | sad128_internal_u16 val[2]; |
757 | | } sad256_internal_u16; |
758 | | |
759 | 0 | SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { |
760 | 0 | sad256_internal_u16 t; |
761 | 0 | t.val[1] = v128_sad_u16_init(); |
762 | 0 | t.val[0] = v128_sad_u16_init(); |
763 | 0 | return t; |
764 | 0 | } |
765 | | |
766 | | /* Implementation dependent return value. Result must be finalised with |
767 | | v256_sad_u16_sum(). |
768 | | The result for more than 16 v256_sad_u16() calls is undefined. */ |
769 | | SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, |
770 | 0 | v256 b) { |
771 | 0 | sad256_internal_u16 t; |
772 | 0 | t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); |
773 | 0 | t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); |
774 | 0 | return t; |
775 | 0 | } |
776 | | |
777 | 0 | SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { |
778 | 0 | return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); |
779 | 0 | } |
780 | | |
781 | | typedef struct { |
782 | | ssd128_internal_s16 val[2]; |
783 | | } ssd256_internal_s16; |
784 | | |
785 | 0 | SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { |
786 | 0 | ssd256_internal_s16 t; |
787 | 0 | t.val[1] = v128_ssd_s16_init(); |
788 | 0 | t.val[0] = v128_ssd_s16_init(); |
789 | 0 | return t; |
790 | 0 | } |
791 | | |
792 | | /* Implementation dependent return value. Result must be finalised with |
793 | | * v256_ssd_s16_sum(). */ |
794 | | SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, |
795 | 0 | v256 b) { |
796 | 0 | ssd256_internal_s16 t; |
797 | 0 | t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]); |
798 | 0 | t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]); |
799 | 0 | return t; |
800 | 0 | } |
801 | | |
802 | 0 | SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { |
803 | 0 | return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); |
804 | 0 | } |
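 | | /* Usage sketch (illustrative; a and b are hypothetical v256 values holding
 | |    16-bit samples): the s16 accumulator follows the same protocol as the
 | |    others but finalises to a 64-bit sum:
 | |
 | |      ssd256_internal_s16 acc = v256_ssd_s16_init();
 | |      acc = v256_ssd_s16(acc, a, b);
 | |      uint64_t ssd = v256_ssd_s16_sum(acc);
 | | */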
805 | | |
806 | | #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ |