Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/simd/v256_intrinsics_v128.h
Columns in the original report: Line | Count | Source. Every executable line in this file reports a count of 0, i.e. none of it was executed in this run. The per-line counters are therefore omitted below and the source is reproduced clean.
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_

#include "config/aom_config.h"

#if HAVE_NEON
#error "Do not use this file for Neon"
#endif

#if HAVE_SSE2
#include "aom_dsp/simd/v128_intrinsics_x86.h"
#else
#include "aom_dsp/simd/v128_intrinsics.h"
#endif

typedef struct {
  v128 val[2];
} v256;
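The whole file follows one pattern: a 256-bit vector is emulated as a pair of 128-bit halves (val[0] is the low half, val[1] the high half), and each v256 operation applies the corresponding v128 operation to both halves. A minimal sketch of how a caller sees this, using only functions defined in this header (the demo function itself is hypothetical):

/* Illustrative only: build a v256 from two v128 halves and take it apart
   again. Assumes the v128 intrinsics included above. */
static void v256_halves_demo(void) {
  v128 lo = v128_dup_8(1);          /* 16 bytes of 0x01 */
  v128 hi = v128_dup_8(2);          /* 16 bytes of 0x02 */
  v256 v = v256_from_v128(hi, lo);  /* note the (hi, lo) argument order */
  v128 back_lo = v256_low_v128(v);  /* == lo */
  v128 back_hi = v256_high_v128(v); /* == hi */
  (void)back_lo;
  (void)back_hi;
}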
SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }

SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.val[1] = hi;
  t.val[0] = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.val[0]);
  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.val[0]);
  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
}

SIMD_INLINE v256 v256_zero(void) {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_64(uint64_t x) {
  v128 t = v128_dup_64(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
}

SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
}
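The reductions above cross the halves by summing two scalar partial results. A hedged scalar model of v256_dotp_s16, assuming the v128 version is the 8-lane signed dot product its name suggests (the lane-array view here is illustrative):

/* Illustrative scalar model of v256_dotp_s16: the sum of 16 signed 16-bit
   products, accumulated without overflow in 64 bits. */
static int64_t dotp_s16_model(const int16_t s16a[16], const int16_t s16b[16]) {
  int64_t sum = 0;
  for (int i = 0; i < 16; i++) sum += (int32_t)s16a[i] * s16b[i];
  return sum;
}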
typedef struct {
  sad128_internal val[2];
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
  sad256_internal t;
  t.val[1] = v128_sad_u8_init();
  t.val[0] = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
}
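The init/accumulate/sum protocol described in the comment above looks like this in use; a hedged sketch in which the block size, pointers and strides are assumptions, not from the source:

/* Illustrative: 32x16 SAD between two pixel blocks, staying within the
   16-call limit noted above. */
static uint32_t sad_32x16_demo(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  sad256_internal s = v256_sad_u8_init();
  for (int row = 0; row < 16; row++) { /* 16 calls: the documented maximum */
    s = v256_sad_u8(s, v256_load_unaligned(src + row * src_stride),
                    v256_load_unaligned(ref + row * ref_stride));
  }
  return v256_sad_u8_sum(s);
}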
typedef struct {
  ssd128_internal val[2];
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8_init();
  t.val[0] = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
}
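The sum-of-squared-differences accumulator follows the same protocol; a hedged sketch with hypothetical buffers and strides:

/* Illustrative: SSD over 8 rows of 32 pixels. */
static uint32_t ssd_32x8_demo(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
  ssd256_internal s = v256_ssd_u8_init();
  for (int row = 0; row < 8; row++) {
    s = v256_ssd_u8(s, v256_load_unaligned(src + row * src_stride),
                    v256_load_unaligned(ref + row * ref_stride));
  }
  return v256_ssd_u8_sum(s);
}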
SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.val[1], b.val[1]),
                        v128_or(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
                        v128_xor(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.val[1], b.val[1]),
                        v128_and(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
                        v128_andn(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
                        v128_add_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
                        v128_add_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
                        v128_sadd_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
                        v128_sadd_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
                        v128_sadd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
                        v128_add_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
                        v128_add_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_padd_u8(v256 a) {
  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
                        v128_sub_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
                        v128_ssub_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
                        v128_ssub_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
                        v128_sub_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
                        v128_ssub_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
                        v128_ssub_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
                        v128_sub_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
                        v128_sub_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
}

SIMD_INLINE v256 v256_abs_s8(v256 a) {
  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
}

SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}
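v256_mul_s16 widens 16-bit products to 32 bits by computing the low and high 16-bit halves of each product separately and interleaving them with zips. The identity it relies on, as a hedged scalar sketch (the model function is hypothetical, and it assumes the usual arithmetic behaviour of mullo/mulhi):

/* Illustrative scalar model of one lane: the full 32-bit product is
   reassembled from its low half (mullo) and high half (mulhi). */
static int32_t widen_mul_s16_model(int16_t a, int16_t b) {
  int32_t p = (int32_t)a * (int32_t)b; /* exact 32-bit product */
  uint16_t lo = (uint16_t)p;           /* what a v128_mullo_s16 lane holds */
  uint16_t hi = (uint16_t)(p >> 16);   /* what a v128_mulhi_s16 lane holds */
  /* Interleaving lo below hi (ziplo/ziphi above) rebuilds p exactly: */
  return (int32_t)(((uint32_t)hi << 16) | lo);
}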
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
                        v128_mullo_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
                        v128_mulhi_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
                        v128_mullo_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
                        v128_madd_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
                        v128_madd_us8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
                        v128_avg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
                        v128_rdavg_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
                        v128_rdavg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
                        v128_avg_u16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
                        v128_min_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
                        v128_max_u8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
                        v128_min_s8(a.val[0], b.val[0]));
}

SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
  return (v128_movemask_8(v256_high_v128(a)) << 16) |
         v128_movemask_8(v256_low_v128(a));
}

SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
                        v128_max_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
                        v128_min_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
                        v128_max_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
                        v128_min_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
                        v128_max_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
                        v128_ziplo_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
                        v128_ziplo_8(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
                        v128_ziplo_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
                        v128_ziplo_16(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
                        v128_ziplo_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
                        v128_ziplo_32(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
                        v128_ziplo_64(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
                        v128_ziplo_64(a.val[1], b.val[1]));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.val[0], b.val[0]);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.val[1], b.val[1]);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
                        v128_unziplo_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
                        v128_unziphi_8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
                        v128_unziplo_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
                        v128_unziphi_16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
                        v128_unziplo_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
                        v128_unziphi_32(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 0)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 0)));
#else
  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
#if HAVE_SSE2
  return v256_from_v128(
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
                                      _mm_castsi128_pd(a.val[1]), 3)),
      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
                                      _mm_castsi128_pd(b.val[1]), 3)));
#else
  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
#endif
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
                        v128_unpacklo_u8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
                        v128_unpacklo_u8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
                        v128_unpacklo_s8_s16(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
                        v128_unpacklo_s8_s16(a.val[1]));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
                        v128_pack_s32_s16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
                        v128_pack_s32_u16(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
                        v128_pack_s16_u8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
                        v128_pack_s16_s8(b.val[1], b.val[0]));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
                        v128_unpacklo_u16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
                        v128_unpacklo_s16_s32(a.val[0]));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
                        v128_unpacklo_u16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
                        v128_unpacklo_s16_s32(a.val[1]));
}

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
                        v128_cmpgt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
                        v128_cmplt_s8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
                        v128_cmpeq_8(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
                        v128_cmpgt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
                        v128_cmplt_s16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
                        v128_cmpeq_16(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
                        v128_cmpgt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
                        v128_cmplt_s32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
                        v128_cmpeq_32(a.val[0], b.val[0]));
}

SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
  return v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
}

SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 c32 = v128_dup_8(32);
  v128 c48 = v128_dup_8(48);
  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
  v256 r1 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
                   maskhi48),
      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
                   masklo48));
  v256 r2 = v256_from_v128(
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}
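v256_shuffle_8 lets a pattern index select any of the 32 input bytes, so the emulation shuffles both halves twice and blends on whether each index is below 16. A hedged sketch of a typical call, assuming result byte k is input byte pattern[k] and the usual lane order where byte 0 is the least-significant byte of the last v256_from_64 argument; the demo function and its reversal pattern are illustrative, not from the source:

/* Illustrative: reverse all 32 bytes of a v256, i.e. pattern[k] = 31 - k. */
static v256 reverse_bytes_demo(v256 x) {
  const v256 rev = v256_from_64(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL,
                                0x1011121314151617ULL, 0x18191a1b1c1d1e1fULL);
  return v256_shuffle_8(x, rev);
}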
SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
}

SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
}

SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
}
/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                              \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
                             v128_shl_n_byte(a.val[0], (n)))               \
            : v256_from_v128(                                              \
                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                  v128_zero()))

#define v256_shr_n_byte(a, n)                                                \
  (n == 0                                                                    \
       ? a                                                                   \
       : ((n) < 16                                                           \
              ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
                               v128_or(v128_shr_n_byte(a.val[0], n),         \
                                       v128_shl_n_byte(a.val[1], 16 - (n)))) \
              : v256_from_v128(                                              \
                    v128_zero(),                                             \
                    (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)

#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
#define v256_shl_n_64(a, n) \
  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
#define v256_shr_n_u64(a, n) \
  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
#define v256_shr_n_s64(a, n) \
  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))

#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
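Because the underlying instructions encode the shift amount as an immediate, the _n_ macros above only work with compile-time constants. A hedged sketch of the distinction (the wrapper function is hypothetical):

/* Illustrative: immediate-form shifts need a literal count. */
static v256 halve_u16_lanes(v256 a) {
  return v256_shr_n_u16(a, 1); /* OK: 1 is a compile-time constant */
}
/* A runtime count must instead go through the function forms defined
   earlier, e.g. v256_shr_u16(a, c), which accept a variable c. */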
typedef struct {
  sad128_internal_u16 val[2];
} sad256_internal_u16;

SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16_init();
  t.val[0] = v128_sad_u16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
   v256_sad_u16_sum().
   The result for more than 16 v256_sad_u16() calls is undefined. */
SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
                                             v256 b) {
  sad256_internal_u16 t;
  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
}
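Same protocol as the 8-bit SAD above, but over 16 unsigned 16-bit lanes per call, as used for high-bit-depth samples. A hedged sketch with hypothetical buffers and strides (strides here are in elements, not bytes):

/* Illustrative: SAD over 16-bit samples, within the 16-call limit. */
static uint32_t sad16_16x8_demo(const uint16_t *src16, int src_stride,
                                const uint16_t *ref16, int ref_stride) {
  sad256_internal_u16 s = v256_sad_u16_init();
  for (int row = 0; row < 8; row++) {
    s = v256_sad_u16(s, v256_load_unaligned(src16 + row * src_stride),
                     v256_load_unaligned(ref16 + row * ref_stride));
  }
  return v256_sad_u16_sum(s);
}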
typedef struct {
  ssd128_internal_s16 val[2];
} ssd256_internal_s16;

SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16_init();
  t.val[0] = v128_ssd_s16_init();
  return t;
}

/* Implementation dependent return value.  Result must be finalised with
 * v256_ssd_s16_sum(). */
SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
                                             v256 b) {
  ssd256_internal_s16 t;
  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
  return t;
}

SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}
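Note the 64-bit sum: squared 16-bit values can overflow 32 bits quickly. A hedged usage sketch; the residual buffers and row count are assumptions:

/* Illustrative: 64-bit SSD accumulation over signed 16-bit residuals,
   16 lanes (one v256) per row. */
static uint64_t ssd_s16_demo(const int16_t *diff_a, const int16_t *diff_b,
                             int rows) {
  ssd256_internal_s16 s = v256_ssd_s16_init();
  for (int row = 0; row < rows; row++) {
    s = v256_ssd_s16(s, v256_load_unaligned(diff_a + row * 16),
                     v256_load_unaligned(diff_b + row * 16));
  }
  return v256_ssd_s16_sum(s);
}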
#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_