Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/variance_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <assert.h>
12
#include <emmintrin.h>  // SSE2
13
14
#include "./vpx_config.h"
15
#include "./vpx_dsp_rtcd.h"
16
#include "vpx_ports/mem.h"
17
#include "vpx_dsp/x86/mem_sse2.h"
18
19
305M
// Horizontal sum of the four 32-bit lanes of |val|.
static INLINE unsigned int add32x4_sse2(__m128i val) {
  const __m128i hi64 = _mm_srli_si128(val, 8);
  const __m128i sum64 = _mm_add_epi32(val, hi64);
  const __m128i hi32 = _mm_srli_si128(sum64, 4);
  const __m128i sum32 = _mm_add_epi32(sum64, hi32);
  return (unsigned int)_mm_cvtsi128_si32(sum32);
}
24
25
0
// Sum of squares of a 16x16 macroblock of 16-bit values (256 elements).
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
  __m128i acc = _mm_setzero_si128();
  int i;

  // 256 values, 8 per iteration; PMADDWD squares and pair-sums into
  // 32-bit lanes, which cannot overflow for 256 * 2^15-magnitude inputs.
  for (i = 0; i < 256; i += 8, src_ptr += 8) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
    acc = _mm_add_epi32(acc, _mm_madd_epi16(v, v));
  }

  // Horizontal add of the four 32-bit partial sums.
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  return (unsigned int)_mm_cvtsi128_si32(acc);
}
37
38
1.25G
// Load two 4-pixel rows at |p| and |p + stride| and widen the eight
// bytes to 16-bit lanes (row 0 in lanes 0-3, row 1 in lanes 4-7).
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
  const __m128i row0 = _mm_cvtsi32_si128(loadu_int32(p));
  const __m128i row1 = _mm_cvtsi32_si128(loadu_int32(p + stride));
  const __m128i both = _mm_unpacklo_epi32(row0, row1);
  return _mm_unpacklo_epi8(both, _mm_setzero_si128());
}
44
45
// Fold one pair of 8x16-bit pixel vectors into the accumulators: the
// signed difference into *sum (16-bit lanes) and its square into *sse
// (32-bit lanes, pair-summed by PMADDWD).
static INLINE void variance_kernel_sse2(const __m128i src_ptr,
                                        const __m128i ref_ptr,
                                        __m128i *const sse,
                                        __m128i *const sum) {
  const __m128i d = _mm_sub_epi16(src_ptr, ref_ptr);
  *sum = _mm_add_epi16(*sum, d);
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(d, d));
}
53
54
// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
55
// Slightly faster than variance_final_256_pel_sse2()
56
// Reduce the accumulated SSE / diff-sum vectors for blocks of up to 128
// pixels (such as 8x16 or 16x8).  Slightly faster than
// variance_final_256_pel_sse2().
static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  // Horizontal add of the four 32-bit SSE lanes.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = (unsigned int)_mm_cvtsi128_si32(vsse);

  // Horizontal add of the eight 16-bit sum lanes; with at most 128
  // pixels the total still fits in a signed 16-bit value.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
}
66
67
// Can handle 256 pixels' diff sum (such as 16x16)
68
// Reduce the accumulated SSE / diff-sum vectors for blocks of up to 256
// pixels (such as 16x16).  The last fold is done in scalar code because
// a 256-pixel diff total can need 17 bits: the two remaining 16-bit
// lanes are extracted and added as ints.
static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  // Horizontal add of the four 32-bit SSE lanes.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = (unsigned int)_mm_cvtsi128_si32(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
}
78
79
// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
80
// Reduce the accumulated SSE / diff-sum vectors for blocks of up to 512
// pixels (such as 16x32 or 32x16).
static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  // Horizontal add of the four 32-bit SSE lanes.
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = (unsigned int)_mm_cvtsi128_si32(vsse);

  // Fold eight 16-bit partial sums to four, then sign-extend to 32 bits
  // before the final horizontal add: a 512-pixel diff total can exceed
  // the 16-bit range.
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_srai_epi32(_mm_unpacklo_epi16(vsum, vsum), 16);
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sum = _mm_cvtsi128_si32(vsum);
}
90
91
0
// Widen eight signed 16-bit diff sums into four 32-bit lanes
// (lane i of the result is lane i + lane i+4 of the input).
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
  const __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
  const __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
  return _mm_add_epi32(lo, hi);
}
96
97
// Can handle 1024 pixels' diff sum (such as 32x32)
98
0
// Reduce up to 1024 pixels' worth (such as 32x32) of 16-bit diff sums
// to a single signed integer: sign-extend to 32 bits, then add all lanes.
static INLINE int sum_final_sse2(const __m128i sum) {
  const __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
  const __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
  __m128i t = _mm_add_epi32(lo, hi);
  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
  return _mm_cvtsi128_si32(t);
}
102
103
static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
104
                                  const uint8_t *ref_ptr, const int ref_stride,
105
                                  const int h, __m128i *const sse,
106
305M
                                  __m128i *const sum) {
107
305M
  int i;
108
109
305M
  assert(h <= 256);  // May overflow for larger height.
110
305M
  *sse = _mm_setzero_si128();
111
305M
  *sum = _mm_setzero_si128();
112
113
932M
  for (i = 0; i < h; i += 2) {
114
627M
    const __m128i s = load4x2_sse2(src_ptr, src_stride);
115
627M
    const __m128i r = load4x2_sse2(ref_ptr, ref_stride);
116
117
627M
    variance_kernel_sse2(s, r, sse, sum);
118
627M
    src_ptr += 2 * src_stride;
119
627M
    ref_ptr += 2 * ref_stride;
120
627M
  }
121
305M
}
122
123
// Accumulate SSE and diff sum over an 8-pixel-wide block of height h.
// *sse / *sum are reset here.
static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
                                  const uint8_t *ref_ptr, const int ref_stride,
                                  const int h, __m128i *const sse,
                                  __m128i *const sum) {
  const __m128i zero = _mm_setzero_si128();
  int row;

  assert(h <= 128);  // May overflow for larger height.
  *sse = zero;
  *sum = zero;

  for (row = 0; row < h; ++row) {
    // Widen one 8-byte row of each buffer to 16-bit lanes, then fold the
    // difference and its square into the accumulators.
    const __m128i s =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
    const __m128i r =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);
    const __m128i diff = _mm_sub_epi16(s, r);
    *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
    *sum = _mm_add_epi16(*sum, diff);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}
145
146
// Fold one 16-pixel row pair into the accumulators: the row is split
// into low/high 8-pixel halves, widened to 16 bits, and each half's
// difference and squared difference are accumulated.
static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
                                          const uint8_t *const ref_ptr,
                                          __m128i *const sse,
                                          __m128i *const sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
  const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
  const __m128i diff_lo =
      _mm_sub_epi16(_mm_unpacklo_epi8(s, zero), _mm_unpacklo_epi8(r, zero));
  const __m128i diff_hi =
      _mm_sub_epi16(_mm_unpackhi_epi8(s, zero), _mm_unpackhi_epi8(r, zero));

  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff_lo, diff_lo));
  *sum = _mm_add_epi16(*sum, diff_lo);
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff_hi, diff_hi));
  *sum = _mm_add_epi16(*sum, diff_hi);
}
161
162
// Accumulate SSE and diff sum over a 16-pixel-wide block of height h.
// *sse / *sum are reset here.
static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int row;

  assert(h <= 64);  // May overflow for larger height.
  *sse = _mm_setzero_si128();
  *sum = _mm_setzero_si128();

  for (row = 0; row < h; ++row, src_ptr += src_stride, ref_ptr += ref_stride) {
    variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
  }
}
178
179
// Accumulate SSE and diff sum over a 32-pixel-wide block of height h.
// *sse is a running accumulator carried across calls and is NOT reset;
// only *sum is cleared (callers widen it between strips).
static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int row;

  assert(h <= 32);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (row = 0; row < h; ++row, src_ptr += src_stride, ref_ptr += ref_stride) {
    variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
    variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
  }
}
196
197
// Accumulate SSE and diff sum over a 64-pixel-wide block of height h.
// *sse is a running accumulator carried across calls and is NOT reset;
// only *sum is cleared (callers widen it between strips).
static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int row, x;

  assert(h <= 16);  // May overflow for larger height.
  *sum = _mm_setzero_si128();

  for (row = 0; row < h; ++row) {
    // Four 16-pixel kernels cover one 64-pixel row.
    for (x = 0; x < 64; x += 16) {
      variance16_kernel_sse2(src_ptr + x, ref_ptr + x, sse, sum);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}
216
217
// Export the raw 8x8 SSE and diff sum (used by composite variance code).
void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  __m128i sse_vec, sum_vec;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, sum);
}
224
225
// Export the raw 16x16 SSE and diff sum (used by composite variance code).
void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  __m128i sse_vec, sum_vec;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_vec,
                  &sum_vec);
  variance_final_256_pel_sse2(sse_vec, sum_vec, sse, sum);
}
232
233
// 4x4 variance: SSE minus the squared mean; >> 4 divides by 16 pixels.
unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}
242
243
// 4x8 variance: SSE minus the squared mean; >> 5 divides by 32 pixels.
unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}
252
253
// 8x4 variance: SSE minus the squared mean; >> 5 divides by 32 pixels.
unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}
262
263
// 8x8 variance: SSE minus the squared mean; >> 6 divides by 64 pixels.
unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}
272
273
// 8x16 variance: SSE minus the squared mean; >> 7 divides by 128 pixels.
unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_vec,
                 &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}
282
283
// 16x8 variance: SSE minus the squared mean; >> 7 divides by 128 pixels.
unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &sse_vec,
                  &sum_vec);
  variance_final_128_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}
292
293
// 16x16 variance; >> 8 divides by 256 pixels.  The sum can reach
// 255 * 256, so the square is formed in 64 bits to avoid int overflow.
unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_vec,
                  &sum_vec);
  variance_final_256_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
302
303
// 16x32 variance; >> 9 divides by 512 pixels (64-bit square, see above).
unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i sse_vec, sum_vec;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &sse_vec,
                  &sum_vec);
  variance_final_512_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
312
313
// 32x16 variance; >> 9 divides by 512 pixels.  The SSE vector must be
// zeroed here because variance32_sse2() only accumulates into it.
unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i sse_vec = _mm_setzero_si128();
  __m128i sum_vec;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &sse_vec,
                  &sum_vec);
  variance_final_512_pel_sse2(sse_vec, sum_vec, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
323
324
// 32x32 variance; >> 10 divides by 1024 pixels.  Uses the wider
// sum_final_sse2() reduction since 1024 diffs exceed the 16-bit range.
unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i sse_vec = _mm_setzero_si128();
  __m128i sum_vec;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &sse_vec,
                  &sum_vec);
  *sse = add32x4_sse2(sse_vec);
  sum = sum_final_sse2(sum_vec);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
335
336
unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
337
                                    const uint8_t *ref_ptr, int ref_stride,
338
0
                                    unsigned int *sse) {
339
0
  __m128i vsse = _mm_setzero_si128();
340
0
  __m128i vsum = _mm_setzero_si128();
341
0
  int sum;
342
0
  int i = 0;
343
344
0
  for (i = 0; i < 2; i++) {
345
0
    __m128i vsum16;
346
0
    variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
347
0
                    ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
348
0
                    &vsum16);
349
0
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
350
0
  }
351
0
  *sse = add32x4_sse2(vsse);
352
0
  sum = (int)add32x4_sse2(vsum);
353
0
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
354
0
}
355
356
unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
357
                                    const uint8_t *ref_ptr, int ref_stride,
358
0
                                    unsigned int *sse) {
359
0
  __m128i vsse = _mm_setzero_si128();
360
0
  __m128i vsum = _mm_setzero_si128();
361
0
  int sum;
362
0
  int i = 0;
363
364
0
  for (i = 0; i < 2; i++) {
365
0
    __m128i vsum16;
366
0
    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
367
0
                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
368
0
                    &vsum16);
369
0
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
370
0
  }
371
0
  *sse = add32x4_sse2(vsse);
372
0
  sum = (int)add32x4_sse2(vsum);
373
0
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
374
0
}
375
376
unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
377
                                    const uint8_t *ref_ptr, int ref_stride,
378
0
                                    unsigned int *sse) {
379
0
  __m128i vsse = _mm_setzero_si128();
380
0
  __m128i vsum = _mm_setzero_si128();
381
0
  int sum;
382
0
  int i = 0;
383
384
0
  for (i = 0; i < 4; i++) {
385
0
    __m128i vsum16;
386
0
    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
387
0
                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
388
0
                    &vsum16);
389
0
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
390
0
  }
391
0
  *sse = add32x4_sse2(vsse);
392
0
  sum = (int)add32x4_sse2(vsum);
393
0
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
394
0
}
395
396
// 8x8 MSE: the SSE term only — the mean-removal step of the variance is
// intentionally skipped, so the variance result is discarded.
unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}
402
403
// 8x16 MSE: the SSE term only — the variance's mean correction is skipped.
unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}
409
410
// 16x8 MSE: the SSE term only — the variance's mean correction is skipped.
unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}
416
417
// 16x16 MSE: the SSE term only — the variance's mean correction is skipped.
unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}
423
424
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm
// DECL(w, opt) emits the prototype of the column-width-w assembly helper
// vpx_sub_pixel_variance<w>xh_<opt>(); DECLS instantiates it for the
// 4-, 8- and 16-pixel column widths of one SIMD flavor.
#define DECL(w, opt)                                                          \
  int vpx_sub_pixel_variance##w##xh_##opt(                                    \
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,             \
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
      unsigned int *sse, void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

// Prototypes for both the SSE2 and SSSE3 assembly implementations.
DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
440
441
// FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) emits
// vpx_sub_pixel_variance<w>x<h>_<opt>(): the w x h block is processed in
// wf-wide columns by the assembly helper declared above (at column
// offsets 0, 16, 32, 48), and the per-column sums/SSEs are combined.
// wlog2 + hlog2 == log2(w * h), used to divide se^2 by the pixel count;
// cast / cast_prod select a 64-bit product where se^2 could overflow int.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
    unsigned int sse_tmp;                                                 \
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
        &sse_tmp, NULL, NULL);                                            \
    if (w > wf) {                                                         \
      unsigned int sse2;                                                  \
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
          ref_stride, h, &sse2, NULL, NULL);                              \
      se += se2;                                                          \
      sse_tmp += sse2;                                                    \
      if (w > wf * 2) {                                                   \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
            ref_stride, h, &sse2, NULL, NULL);                            \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
            ref_stride, h, &sse2, NULL, NULL);                            \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
      }                                                                   \
    }                                                                     \
    *sse = sse_tmp;                                                       \
    return sse_tmp -                                                      \
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
  }
Unexecuted instantiation: vpx_sub_pixel_variance64x64_sse2
Unexecuted instantiation: vpx_sub_pixel_variance64x32_sse2
Unexecuted instantiation: vpx_sub_pixel_variance32x64_sse2
Unexecuted instantiation: vpx_sub_pixel_variance32x32_sse2
Unexecuted instantiation: vpx_sub_pixel_variance32x16_sse2
Unexecuted instantiation: vpx_sub_pixel_variance16x32_sse2
Unexecuted instantiation: vpx_sub_pixel_variance16x16_sse2
Unexecuted instantiation: vpx_sub_pixel_variance16x8_sse2
Unexecuted instantiation: vpx_sub_pixel_variance8x16_sse2
Unexecuted instantiation: vpx_sub_pixel_variance8x8_sse2
Unexecuted instantiation: vpx_sub_pixel_variance8x4_sse2
Unexecuted instantiation: vpx_sub_pixel_variance4x8_sse2
Unexecuted instantiation: vpx_sub_pixel_variance4x4_sse2
Unexecuted instantiation: vpx_sub_pixel_variance64x64_ssse3
vpx_sub_pixel_variance64x32_ssse3
Line
Count
Source
444
658k
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
658k
    unsigned int sse_tmp;                                                 \
446
658k
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
658k
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
658k
        &sse_tmp, NULL, NULL);                                            \
449
658k
    if (w > wf) {                                                         \
450
658k
      unsigned int sse2;                                                  \
451
658k
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
658k
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
658k
          ref_stride, h, &sse2, NULL, NULL);                              \
454
658k
      se += se2;                                                          \
455
658k
      sse_tmp += sse2;                                                    \
456
658k
      if (w > wf * 2) {                                                   \
457
658k
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
658k
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
658k
            ref_stride, h, &sse2, NULL, NULL);                            \
460
658k
        se += se2;                                                        \
461
658k
        sse_tmp += sse2;                                                  \
462
658k
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
658k
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
658k
            ref_stride, h, &sse2, NULL, NULL);                            \
465
658k
        se += se2;                                                        \
466
658k
        sse_tmp += sse2;                                                  \
467
658k
      }                                                                   \
468
658k
    }                                                                     \
469
658k
    *sse = sse_tmp;                                                       \
470
658k
    return sse_tmp -                                                      \
471
658k
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
658k
  }
vpx_sub_pixel_variance32x64_ssse3
Line
Count
Source
444
586k
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
586k
    unsigned int sse_tmp;                                                 \
446
586k
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
586k
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
586k
        &sse_tmp, NULL, NULL);                                            \
449
586k
    if (w > wf) {                                                         \
450
586k
      unsigned int sse2;                                                  \
451
586k
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
586k
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
586k
          ref_stride, h, &sse2, NULL, NULL);                              \
454
586k
      se += se2;                                                          \
455
586k
      sse_tmp += sse2;                                                    \
456
586k
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
586k
    }                                                                     \
469
586k
    *sse = sse_tmp;                                                       \
470
586k
    return sse_tmp -                                                      \
471
586k
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
586k
  }
Unexecuted instantiation: vpx_sub_pixel_variance32x32_ssse3
vpx_sub_pixel_variance32x16_ssse3
Line
Count
Source
444
2.20M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
2.20M
    unsigned int sse_tmp;                                                 \
446
2.20M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
2.20M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
2.20M
        &sse_tmp, NULL, NULL);                                            \
449
2.20M
    if (w > wf) {                                                         \
450
2.20M
      unsigned int sse2;                                                  \
451
2.20M
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
2.20M
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
2.20M
          ref_stride, h, &sse2, NULL, NULL);                              \
454
2.20M
      se += se2;                                                          \
455
2.20M
      sse_tmp += sse2;                                                    \
456
2.20M
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
2.20M
    }                                                                     \
469
2.20M
    *sse = sse_tmp;                                                       \
470
2.20M
    return sse_tmp -                                                      \
471
2.20M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
2.20M
  }
vpx_sub_pixel_variance16x32_ssse3
Line
Count
Source
444
2.14M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
2.14M
    unsigned int sse_tmp;                                                 \
446
2.14M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
2.14M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
2.14M
        &sse_tmp, NULL, NULL);                                            \
449
2.14M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
2.14M
    *sse = sse_tmp;                                                       \
470
2.14M
    return sse_tmp -                                                      \
471
2.14M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
2.14M
  }
vpx_sub_pixel_variance16x16_ssse3
Line
Count
Source
444
26.5M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
26.5M
    unsigned int sse_tmp;                                                 \
446
26.5M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
26.5M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
26.5M
        &sse_tmp, NULL, NULL);                                            \
449
26.5M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
26.5M
    *sse = sse_tmp;                                                       \
470
26.5M
    return sse_tmp -                                                      \
471
26.5M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
26.5M
  }
vpx_sub_pixel_variance16x8_ssse3
Line
Count
Source
444
15.2M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
15.2M
    unsigned int sse_tmp;                                                 \
446
15.2M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
15.2M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
15.2M
        &sse_tmp, NULL, NULL);                                            \
449
15.2M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
15.2M
    *sse = sse_tmp;                                                       \
470
15.2M
    return sse_tmp -                                                      \
471
15.2M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
15.2M
  }
vpx_sub_pixel_variance8x16_ssse3
Line
Count
Source
444
15.7M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
15.7M
    unsigned int sse_tmp;                                                 \
446
15.7M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
15.7M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
15.7M
        &sse_tmp, NULL, NULL);                                            \
449
15.7M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
15.7M
    *sse = sse_tmp;                                                       \
470
15.7M
    return sse_tmp -                                                      \
471
15.7M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
15.7M
  }
vpx_sub_pixel_variance8x8_ssse3
Line
Count
Source
444
44.8M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
44.8M
    unsigned int sse_tmp;                                                 \
446
44.8M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
44.8M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
44.8M
        &sse_tmp, NULL, NULL);                                            \
449
44.8M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
44.8M
    *sse = sse_tmp;                                                       \
470
44.8M
    return sse_tmp -                                                      \
471
44.8M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
44.8M
  }
vpx_sub_pixel_variance8x4_ssse3
Line
Count
Source
444
28.4M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
28.4M
    unsigned int sse_tmp;                                                 \
446
28.4M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
28.4M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
28.4M
        &sse_tmp, NULL, NULL);                                            \
449
28.4M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
28.4M
    *sse = sse_tmp;                                                       \
470
28.4M
    return sse_tmp -                                                      \
471
28.4M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
28.4M
  }
vpx_sub_pixel_variance4x8_ssse3
Line
Count
Source
444
28.8M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
28.8M
    unsigned int sse_tmp;                                                 \
446
28.8M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
28.8M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
28.8M
        &sse_tmp, NULL, NULL);                                            \
449
28.8M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
28.8M
    *sse = sse_tmp;                                                       \
470
28.8M
    return sse_tmp -                                                      \
471
28.8M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
28.8M
  }
vpx_sub_pixel_variance4x4_ssse3
Line
Count
Source
444
88.6M
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
445
88.6M
    unsigned int sse_tmp;                                                 \
446
88.6M
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
447
88.6M
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
448
88.6M
        &sse_tmp, NULL, NULL);                                            \
449
88.6M
    if (w > wf) {                                                         \
450
0
      unsigned int sse2;                                                  \
451
0
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
452
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
453
0
          ref_stride, h, &sse2, NULL, NULL);                              \
454
0
      se += se2;                                                          \
455
0
      sse_tmp += sse2;                                                    \
456
0
      if (w > wf * 2) {                                                   \
457
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
458
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
459
0
            ref_stride, h, &sse2, NULL, NULL);                            \
460
0
        se += se2;                                                        \
461
0
        sse_tmp += sse2;                                                  \
462
0
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
463
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
464
0
            ref_stride, h, &sse2, NULL, NULL);                            \
465
0
        se += se2;                                                        \
466
0
        sse_tmp += sse2;                                                  \
467
0
      }                                                                   \
468
0
    }                                                                     \
469
88.6M
    *sse = sse_tmp;                                                       \
470
88.6M
    return sse_tmp -                                                      \
471
88.6M
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
472
88.6M
  }
473
474
#define FNS(opt1, opt2)                             \
475
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
476
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
477
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
478
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
479
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
480
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
481
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
482
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t))   \
483
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t))    \
484
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t))     \
485
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t))     \
486
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t))     \
487
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
488
489
FNS(sse2, sse2)
490
FNS(ssse3, ssse3)
491
492
#undef FNS
493
#undef FN
494
495
// The 2 unused parameters are place holders for PIC enabled build.
496
#define DECL(w, opt)                                                   \
497
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                         \
498
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,      \
499
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride,      \
500
      const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
501
      unsigned int *sse, void *unused0, void *unused)
502
#define DECLS(opt1, opt2) \
503
  DECL(4, opt1);          \
504
  DECL(8, opt1);          \
505
  DECL(16, opt1)
506
507
DECLS(sse2, sse2);
508
DECLS(ssse3, ssse3);
509
#undef DECL
510
#undef DECLS
511
512
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
513
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(               \
514
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
515
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,          \
516
0
      const uint8_t *second_pred) {                                       \
517
0
    unsigned int sse_tmp;                                                 \
518
0
    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                    \
519
0
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,     \
520
0
        second_pred, w, h, &sse_tmp, NULL, NULL);                         \
521
0
    if (w > wf) {                                                         \
522
0
      unsigned int sse2;                                                  \
523
0
      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                 \
524
0
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
525
0
          ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL);         \
526
0
      se += se2;                                                          \
527
0
      sse_tmp += sse2;                                                    \
528
0
      if (w > wf * 2) {                                                   \
529
0
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
530
0
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
531
0
            ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL);       \
532
0
        se += se2;                                                        \
533
0
        sse_tmp += sse2;                                                  \
534
0
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
535
0
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
536
0
            ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL);       \
537
0
        se += se2;                                                        \
538
0
        sse_tmp += sse2;                                                  \
539
0
      }                                                                   \
540
0
    }                                                                     \
541
0
    *sse = sse_tmp;                                                       \
542
0
    return sse_tmp -                                                      \
543
0
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
544
0
  }
Unexecuted instantiation: vpx_sub_pixel_avg_variance64x64_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance64x32_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x64_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x32_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x16_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x32_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x16_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x8_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x16_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x8_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x4_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance4x8_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance4x4_sse2
Unexecuted instantiation: vpx_sub_pixel_avg_variance64x64_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance64x32_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x64_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x32_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance32x16_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x32_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x16_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance16x8_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x16_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x8_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance8x4_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance4x8_ssse3
Unexecuted instantiation: vpx_sub_pixel_avg_variance4x4_ssse3
545
546
#define FNS(opt1, opt2)                             \
547
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
548
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
549
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
550
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
551
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
552
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
553
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
554
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t))  \
555
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t))   \
556
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t))    \
557
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t))    \
558
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t))    \
559
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
560
561
FNS(sse2, sse)
562
FNS(ssse3, ssse3)
563
564
#undef FNS
565
#undef FN