Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
Every instrumented line in this file reports an execution count of 0: nothing in it was exercised by this test run. The source is listed below.
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
                                                   __m128i *out_lo,
                                                   __m128i *out_hi) {
  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
}

void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

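For reference, a scalar version makes the intent of the reduction above explicit: the function returns the minimum and maximum absolute pixel difference between two 8x8 blocks, computing |diff| as max(diff, -diff) just as the SIMD code does. This is an illustrative sketch only, not code from this file:

static void minmax_8x8_c_sketch(const uint8_t *s, int p, const uint8_t *d,
                                int dp, int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      int diff = s[i * p + j] - d[i * dp + j];
      if (diff < 0) diff = -diff;  // |diff|, same as max(diff, -diff) above
      if (diff > *max) *max = diff;
      if (diff < *min) *min = diff;
    }
  }
}
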
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

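The `(avg + 32) >> 6` at the end is round-to-nearest division by 64, the pixel count of the block; the saturating adds can never actually saturate here, since each 16-bit lane accumulates at most 8 pixels of 255. A scalar sketch of the same computation, for illustration only:

static unsigned int avg_8x8_c_sketch(const uint8_t *s, int p) {
  int i, j;
  unsigned int sum = 0;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j) sum += s[i * p + j];
  return (sum + 32) >> 6;  // round-to-nearest division by 64
}
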
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const __m128i zero = _mm_setzero_si128();
  s0 = _mm_loadu_si128((const __m128i *)(s));
  s1 = _mm_loadu_si128((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpackhi_epi16(s0, zero);
  s0 = _mm_unpacklo_epi16(s0, zero);
  s0 = _mm_add_epi32(s0, s1);
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
  avg = (unsigned int)_mm_cvtsi128_si32(s0);

  return (avg + 32) >> 6;
}

unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
  __m128i s0, s1;
  unsigned int avg;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  s0 = _mm_loadl_epi64((const __m128i *)(s));
  s1 = _mm_loadl_epi64((const __m128i *)(s + p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
  avg = _mm_extract_epi16(s0, 0);

  return (avg + 8) >> 4;
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

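In scalar terms, each pass is a three-stage 8-point Hadamard butterfly applied to all eight columns at once; the iter == 0 path additionally transposes the eight vectors (the unpack sequences) so the second pass can transform the other dimension. A hypothetical scalar equivalent of one 8-point butterfly, using the same output permutation as the iter == 1 path above:

// Illustrative sketch only: one 8-point Hadamard butterfly on 16-bit lanes.
static void hadamard_col8_sketch(const int16_t a[8], int16_t out[8]) {
  // Stage 1
  const int16_t b0 = a[0] + a[1], b1 = a[0] - a[1];
  const int16_t b2 = a[2] + a[3], b3 = a[2] - a[3];
  const int16_t b4 = a[4] + a[5], b5 = a[4] - a[5];
  const int16_t b6 = a[6] + a[7], b7 = a[6] - a[7];
  // Stage 2
  const int16_t c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
  const int16_t c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;
  // Stage 3, permuted exactly as the iter == 1 stores above.
  out[0] = c0 + c4; out[7] = c1 + c5; out[3] = c2 + c6; out[4] = c3 + c7;
  out[2] = c0 - c4; out[6] = c1 - c5; out[1] = c2 - c6; out[5] = c3 - c7;
}
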
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

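Because hadamard_8x8_sse2 uses the aligned `_mm_load_si128`, every row of the residual buffer must be 16-byte aligned, so the buffer and the stride both matter. A hypothetical call site (the buffer names are illustrative, not from libvpx):

// Illustrative sketch: transform one 8x8 block of prediction residuals.
DECLARE_ALIGNED(16, int16_t, src_diff[8 * 8]);  // hypothetical input buffer
DECLARE_ALIGNED(16, tran_low_t, coeff[8 * 8]);  // hypothetical output buffer
/* ... fill src_diff with pred - src residuals ... */
vpx_hadamard_8x8_sse2(src_diff, 8, coeff);  // stride 8 keeps each row aligned
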
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}

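The second loop combines the four 8x8 sub-transforms into the 16x16 transform, eight coefficients at a time; the `_mm_srai_epi16(..., 1)` halving keeps the larger transform's values inside 16 bits. A scalar sketch of the combine step (a hypothetical helper, using int arithmetic, so it ignores the 16-bit lane wraparound the packed adds could exhibit, and writing int16_t output for simplicity):

static void hadamard_16x16_combine_sketch(const int16_t *t_coeff,
                                          int16_t *coeff) {
  int i;
  for (i = 0; i < 64; ++i) {
    const int a0 = t_coeff[i], a1 = t_coeff[i + 64];
    const int a2 = t_coeff[i + 128], a3 = t_coeff[i + 192];
    const int b0 = (a0 + a1) >> 1;  // arithmetic shift, as _mm_srai_epi16
    const int b1 = (a0 - a1) >> 1;
    const int b2 = (a2 + a3) >> 1;
    const int b3 = (a2 - a3) >> 1;
    coeff[i] = (int16_t)(b0 + b2);
    coeff[i + 64] = (int16_t)(b1 + b3);
    coeff[i + 128] = (int16_t)(b0 - b2);
    coeff[i + 192] = (int16_t)(b1 - b3);
  }
}
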
void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m128i b0, b1, b2, b3;
  const __m128i zero = _mm_setzero_si128();
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm_srai_epi32(b0_lo, 2);
    b1_lo = _mm_srai_epi32(b1_lo, 2);
    b2_lo = _mm_srai_epi32(b2_lo, 2);
    b3_lo = _mm_srai_epi32(b3_lo, 2);

    b0_hi = _mm_srai_epi32(b0_hi, 2);
    b1_hi = _mm_srai_epi32(b1_hi, 2);
    b2_hi = _mm_srai_epi32(b2_hi, 2);
    b3_hi = _mm_srai_epi32(b3_hi, 2);

    b0 = _mm_packs_epi32(b0_lo, b0_hi);
    b1 = _mm_packs_epi32(b1_lo, b1_hi);
    b2 = _mm_packs_epi32(b2_lo, b2_hi);
    b3 = _mm_packs_epi32(b3_lo, b3_hi);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}

int vpx_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

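SATD here is simply the sum of absolute transform coefficients: the loop computes abs as max(x, -x), widens to 32 bits, and accumulates, with a cascading horizontal sum at the end. A scalar sketch, for illustration only:

static int satd_c_sketch(const tran_low_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i)
    satd += coeff[i] < 0 ? -coeff[i] : coeff[i];  // abs, as max(x, -x) above
  return satd;
}
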
void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

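Each output element of hbuf is a column sum of the 16-pixel-wide strip, scaled down by a shift of 5, 4, or 3 for heights of 64, 32, and anything smaller; at height 64 a lane holds at most 64 * 255 = 16320, so the saturating adds never actually clamp. A scalar sketch (hypothetical helper name):

static void int_pro_row_sketch(int16_t hbuf[16], const uint8_t *ref,
                               int ref_stride, int height) {
  const int norm = (height == 64) ? 5 : (height == 32) ? 4 : 3;
  int i, j;
  for (j = 0; j < 16; ++j) {
    int sum = 0;
    for (i = 0; i < height; ++i) sum += ref[i * ref_stride + j];
    hbuf[j] = (int16_t)(sum >> norm);  // same scaling as the srai above
  }
}
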
int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_loadu_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

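`_mm_sad_epu8` against a zero register sums the 16 bytes of a row directly (the absolute difference from zero is the byte itself), leaving one partial sum per 64-bit half, which the final shift-and-add merges. The whole function reduces to a byte sum across one row, as in this sketch:

static int16_t int_pro_col_sketch(const uint8_t *ref, int width) {
  int i, sum = 0;
  for (i = 0; i < width; ++i) sum += ref[i];  // psadbw vs. zero, scalarized
  return (int16_t)sum;
}
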
int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = (int16_t)_mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}

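The return value is the usual sum-of-squares minus squared-sum-over-N variance, with the division by width == 4 << bwl realized as a shift by bwl + 2. A scalar sketch (hypothetical helper; note the SIMD path above keeps the running sum in 16-bit lanes, where this sketch uses a full int):

static int vector_var_sketch(const int16_t *ref, const int16_t *src,
                             const int bwl) {
  const int width = 4 << bwl;
  int i, sum = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    sum += diff;        // running sum of differences
    sse += diff * diff; // running sum of squared differences
  }
  // sse - sum^2 / width, with the division as a shift since width = 4 << bwl.
  return sse - ((sum * sum) >> (bwl + 2));
}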