Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
#include <emmintrin.h>  // SSE2
11
12
#include "./vpx_config.h"
13
#include "./vpx_dsp_rtcd.h"
14
#include "vpx_ports/mem.h"
15
16
// Signature shared by the fixed-size kernels that the tiling helpers in this
// file receive as `var_fn`.  A kernel is given source/reference pointers with
// their strides and reports its two results through `sse` and `sum`; the
// callers visible in this section ignore the uint32_t return value.
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
17
                                       const uint16_t *ref, int ref_stride,
18
                                       uint32_t *sse, int *sum);
19
20
// Kernels matching high_variance_fn_t.  They are only declared here; their
// definitions are not part of this section (presumably assembly -- TODO
// confirm).
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
21
                                    const uint16_t *ref, int ref_stride,
22
                                    uint32_t *sse, int *sum);
23
24
uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
25
                                      const uint16_t *ref, int ref_stride,
26
                                      uint32_t *sse, int *sum);
27
28
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
29
                                   const uint16_t *ref, int ref_stride, int w,
30
                                   int h, uint32_t *sse, int *sum,
31
0
                                   high_variance_fn_t var_fn, int block_size) {
32
0
  int i, j;
33
34
0
  *sse = 0;
35
0
  *sum = 0;
36
37
0
  for (i = 0; i < h; i += block_size) {
38
0
    for (j = 0; j < w; j += block_size) {
39
0
      unsigned int sse0;
40
0
      int sum0;
41
0
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
42
0
             ref_stride, &sse0, &sum0);
43
0
      *sse += sse0;
44
0
      *sum += sum0;
45
0
    }
46
0
  }
47
0
}
48
49
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
50
                                    const uint16_t *ref, int ref_stride, int w,
51
                                    int h, uint32_t *sse, int *sum,
52
0
                                    high_variance_fn_t var_fn, int block_size) {
53
0
  int i, j;
54
0
  uint64_t sse_long = 0;
55
0
  int32_t sum_long = 0;
56
57
0
  for (i = 0; i < h; i += block_size) {
58
0
    for (j = 0; j < w; j += block_size) {
59
0
      unsigned int sse0;
60
0
      int sum0;
61
0
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
62
0
             ref_stride, &sse0, &sum0);
63
0
      sse_long += sse0;
64
0
      sum_long += sum0;
65
0
    }
66
0
  }
67
0
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
68
0
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
69
0
}
70
71
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
72
                                    const uint16_t *ref, int ref_stride, int w,
73
                                    int h, uint32_t *sse, int *sum,
74
0
                                    high_variance_fn_t var_fn, int block_size) {
75
0
  int i, j;
76
0
  uint64_t sse_long = 0;
77
0
  int32_t sum_long = 0;
78
79
0
  for (i = 0; i < h; i += block_size) {
80
0
    for (j = 0; j < w; j += block_size) {
81
0
      unsigned int sse0;
82
0
      int sum0;
83
0
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
84
0
             ref_stride, &sse0, &sum0);
85
0
      sse_long += sse0;
86
0
      sum_long += sum0;
87
0
    }
88
0
  }
89
0
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
90
0
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
91
0
}
92
93
// Generates the vpx_highbd_{8,10,12}_get<S>x<S>var_sse2() entry points.
// Each one converts the uint8_t-typed buffer handles with
// CONVERT_TO_SHORTPTR and forwards to vpx_highbd_calc<S>x<S>var_sse2(), which
// fills *sse and *sum.  The 8-bit variant leaves those values untouched; the
// 10-bit variant then rounds *sum by 2 bits and *sse by 4 bits, and the
// 12-bit variant by 4 and 8 bits, using ROUND_POWER_OF_TWO.
#define HIGH_GET_VAR(S) \
  void vpx_highbd_8_get##S##x##S##var_sse2( \
      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
      int ref_stride, uint32_t *sse, int *sum) { \
    vpx_highbd_calc##S##x##S##var_sse2( \
        CONVERT_TO_SHORTPTR(src8), src_stride, CONVERT_TO_SHORTPTR(ref8), \
        ref_stride, sse, sum); \
  } \
  \
  void vpx_highbd_10_get##S##x##S##var_sse2( \
      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
      int ref_stride, uint32_t *sse, int *sum) { \
    vpx_highbd_calc##S##x##S##var_sse2( \
        CONVERT_TO_SHORTPTR(src8), src_stride, CONVERT_TO_SHORTPTR(ref8), \
        ref_stride, sse, sum); \
    *sum = ROUND_POWER_OF_TWO(*sum, 2); \
    *sse = ROUND_POWER_OF_TWO(*sse, 4); \
  } \
  \
  void vpx_highbd_12_get##S##x##S##var_sse2( \
      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
      int ref_stride, uint32_t *sse, int *sum) { \
    vpx_highbd_calc##S##x##S##var_sse2( \
        CONVERT_TO_SHORTPTR(src8), src_stride, CONVERT_TO_SHORTPTR(ref8), \
        ref_stride, sse, sum); \
    *sum = ROUND_POWER_OF_TWO(*sum, 4); \
    *sse = ROUND_POWER_OF_TWO(*sse, 8); \
  }

HIGH_GET_VAR(16)
HIGH_GET_VAR(8)

#undef HIGH_GET_VAR
129
130
#define VAR_FN(w, h, block_size, shift)                                    \
131
  uint32_t vpx_highbd_8_variance##w##x##h##_sse2(                          \
132
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
133
0
      int ref_stride, uint32_t *sse) {                                     \
134
0
    int sum;                                                               \
135
0
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
136
0
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
137
0
    highbd_8_variance_sse2(                                                \
138
0
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
139
0
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
140
0
    return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift));             \
141
0
  }                                                                        \
Unexecuted instantiation: vpx_highbd_8_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_8_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_8_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance8x8_sse2
142
                                                                           \
143
  uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
144
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
145
0
      int ref_stride, uint32_t *sse) {                                     \
146
0
    int sum;                                                               \
147
0
    int64_t var;                                                           \
148
0
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
149
0
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
150
0
    highbd_10_variance_sse2(                                               \
151
0
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
152
0
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
153
0
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
154
0
    return (var >= 0) ? (uint32_t)var : 0;                                 \
155
0
  }                                                                        \
Unexecuted instantiation: vpx_highbd_10_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_10_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_10_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance8x8_sse2
156
                                                                           \
157
  uint32_t vpx_highbd_12_variance##w##x##h##_sse2(                         \
158
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
159
0
      int ref_stride, uint32_t *sse) {                                     \
160
0
    int sum;                                                               \
161
0
    int64_t var;                                                           \
162
0
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
163
0
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
164
0
    highbd_12_variance_sse2(                                               \
165
0
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
166
0
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
167
0
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
168
0
    return (var >= 0) ? (uint32_t)var : 0;                                 \
169
0
  }
Unexecuted instantiation: vpx_highbd_12_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_12_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_12_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance8x8_sse2
170
171
VAR_FN(64, 64, 16, 12)
172
VAR_FN(64, 32, 16, 11)
173
VAR_FN(32, 64, 16, 11)
174
VAR_FN(32, 32, 16, 10)
175
VAR_FN(32, 16, 16, 9)
176
VAR_FN(16, 32, 16, 9)
177
VAR_FN(16, 16, 16, 8)
178
VAR_FN(16, 8, 8, 7)
179
VAR_FN(8, 16, 8, 7)
180
VAR_FN(8, 8, 8, 6)
181
182
#undef VAR_FN
183
184
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
185
                                        const uint8_t *ref8, int ref_stride,
186
0
                                        unsigned int *sse) {
187
0
  int sum;
188
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
189
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
190
0
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
191
0
                         vpx_highbd_calc16x16var_sse2, 16);
192
0
  return *sse;
193
0
}
194
195
unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
196
                                         const uint8_t *ref8, int ref_stride,
197
0
                                         unsigned int *sse) {
198
0
  int sum;
199
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
200
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
201
0
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
202
0
                          vpx_highbd_calc16x16var_sse2, 16);
203
0
  return *sse;
204
0
}
205
206
unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
207
                                         const uint8_t *ref8, int ref_stride,
208
0
                                         unsigned int *sse) {
209
0
  int sum;
210
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
211
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
212
0
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
213
0
                          vpx_highbd_calc16x16var_sse2, 16);
214
0
  return *sse;
215
0
}
216
217
unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
218
                                      const uint8_t *ref8, int ref_stride,
219
0
                                      unsigned int *sse) {
220
0
  int sum;
221
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
222
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
223
0
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
224
0
                         vpx_highbd_calc8x8var_sse2, 8);
225
0
  return *sse;
226
0
}
227
228
unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
229
                                       const uint8_t *ref8, int ref_stride,
230
0
                                       unsigned int *sse) {
231
0
  int sum;
232
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
233
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
234
0
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
235
0
                          vpx_highbd_calc8x8var_sse2, 8);
236
0
  return *sse;
237
0
}
238
239
unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
240
                                       const uint8_t *ref8, int ref_stride,
241
0
                                       unsigned int *sse) {
242
0
  int sum;
243
0
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
244
0
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
245
0
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
246
0
                          vpx_highbd_calc8x8var_sse2, 8);
247
0
  return *sse;
248
0
}
249
250
// Prototypes for the sub-pixel variance kernels defined in
// highbd_subpel_variance_impl_sse2.asm, one per supported column width
// (8 and 16).  The two trailing pointer parameters are unused placeholders
// kept for PIC-enabled builds.
int vpx_highbd_sub_pixel_variance8xh_sse2(
    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,
    const uint16_t *ref, ptrdiff_t ref_stride, int height, unsigned int *sse,
    void *unused0, void *unused);

int vpx_highbd_sub_pixel_variance16xh_sse2(
    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,
    const uint16_t *ref, ptrdiff_t ref_stride, int height, unsigned int *sse,
    void *unused0, void *unused);
266
267
// Generates vpx_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>().
//
// Every variant covers the block with wf-wide column kernels
// (vpx_highbd_sub_pixel_variance<wf>xh_<opt>): one call at column 0, a second
// at column 16 when w > wf, and two more at columns 32 and 48 when
// w > 2 * wf.  The column offsets are literal, which matches the table below
// where wf is 16 whenever w exceeds it.  The kernel's return value is added
// to a signed error total and the value it writes through its sse pointer is
// added to a squared-error total.
//
//  - 8-bit:  totals are used as is; returns sse - (se * se >> (wlog2+hlog2)).
//  - 10-bit: se is rounded by 2 bits and sse by 4 bits first; the difference
//            is formed in 64 bits and a negative result is returned as 0.
//  - 12-bit: the block is additionally processed in strips of at most 16
//            rows with a 64-bit squared-error total (presumably to keep the
//            32-bit per-call results from overflowing -- TODO confirm); se is
//            rounded by 4 bits and sse by 8 bits, then as for 10-bit.
//
// In all variants the (rounded) squared-error total is stored in *sse_ptr.
// `cast` is pasted in front of `se * se` to widen the product.
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
  uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
    uint32_t sq_total; \
    int err_total = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src16, src_stride, x_offset, y_offset, ref16, ref_stride, h, \
        &sq_total, NULL, NULL); \
    if (w > wf) { \
      unsigned int sq_part; \
      err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src16 + 16, src_stride, x_offset, y_offset, ref16 + 16, \
          ref_stride, h, &sq_part, NULL, NULL); \
      sq_total += sq_part; \
      if (w > wf * 2) { \
        err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src16 + 32, src_stride, x_offset, y_offset, ref16 + 32, \
            ref_stride, h, &sq_part, NULL, NULL); \
        sq_total += sq_part; \
        err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src16 + 48, src_stride, x_offset, y_offset, ref16 + 48, \
            ref_stride, h, &sq_part, NULL, NULL); \
        sq_total += sq_part; \
      } \
    } \
    *sse_ptr = sq_total; \
    return sq_total - \
           (uint32_t)((cast err_total * err_total) >> (wlog2 + hlog2)); \
  } \
  \
  uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
    int64_t diff; \
    uint32_t sq_total; \
    int err_total = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src16, src_stride, x_offset, y_offset, ref16, ref_stride, h, \
        &sq_total, NULL, NULL); \
    if (w > wf) { \
      uint32_t sq_part; \
      err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src16 + 16, src_stride, x_offset, y_offset, ref16 + 16, \
          ref_stride, h, &sq_part, NULL, NULL); \
      sq_total += sq_part; \
      if (w > wf * 2) { \
        err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src16 + 32, src_stride, x_offset, y_offset, ref16 + 32, \
            ref_stride, h, &sq_part, NULL, NULL); \
        sq_total += sq_part; \
        err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src16 + 48, src_stride, x_offset, y_offset, ref16 + 48, \
            ref_stride, h, &sq_part, NULL, NULL); \
        sq_total += sq_part; \
      } \
    } \
    err_total = ROUND_POWER_OF_TWO(err_total, 2); \
    sq_total = ROUND_POWER_OF_TWO(sq_total, 4); \
    *sse_ptr = sq_total; \
    diff = (int64_t)(sq_total) - \
           ((cast err_total * err_total) >> (wlog2 + hlog2)); \
    if (diff < 0) return 0; \
    return (uint32_t)diff; \
  } \
  \
  uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
    uint64_t sq_total64 = 0; \
    uint32_t sq_total; \
    int64_t diff; \
    int err_total = 0; \
    int strip_row; \
    for (strip_row = 0; strip_row < h; strip_row += 16) { \
      const int rows_left = h - strip_row; \
      const int height = rows_left < 16 ? rows_left : 16; \
      uint16_t *const src_strip = src16 + (strip_row * src_stride); \
      uint16_t *const ref_strip = ref16 + (strip_row * ref_stride); \
      uint32_t sq_part; \
      err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src_strip, src_stride, x_offset, y_offset, ref_strip, ref_stride, \
          height, &sq_part, NULL, NULL); \
      sq_total64 += sq_part; \
      if (w > wf) { \
        err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src_strip + 16, src_stride, x_offset, y_offset, ref_strip + 16, \
            ref_stride, height, &sq_part, NULL, NULL); \
        sq_total64 += sq_part; \
        if (w > wf * 2) { \
          err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
              src_strip + 32, src_stride, x_offset, y_offset, \
              ref_strip + 32, ref_stride, height, &sq_part, NULL, NULL); \
          sq_total64 += sq_part; \
          err_total += vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
              src_strip + 48, src_stride, x_offset, y_offset, \
              ref_strip + 48, ref_stride, height, &sq_part, NULL, NULL); \
          sq_total64 += sq_part; \
        } \
      } \
    } \
    err_total = ROUND_POWER_OF_TWO(err_total, 4); \
    sq_total = (uint32_t)ROUND_POWER_OF_TWO(sq_total64, 8); \
    *sse_ptr = sq_total; \
    diff = (int64_t)(sq_total) - \
           ((cast err_total * err_total) >> (wlog2 + hlog2)); \
    if (diff < 0) return 0; \
    return (uint32_t)diff; \
  }

#define FNS(opt)                       \
  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt, (int64_t))

FNS(sse2)

#undef FNS
#undef FN
404
405
// Prototypes for the sub-pixel averaging variance kernels, one per supported
// column width (16 and 8).  Compared with the plain sub-pixel kernels they
// take an extra `second` buffer and its stride.  The two trailing pointer
// parameters are unused placeholders kept for PIC-enabled builds.
int vpx_highbd_sub_pixel_avg_variance16xh_sse2(
    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,
    const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second,
    ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0,
    void *unused);

int vpx_highbd_sub_pixel_avg_variance8xh_sse2(
    const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,
    const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second,
    ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0,
    void *unused);
419
420
// Generates the 8-, 10- and 12-bit compound-average sub-pixel variance entry
// points for one w x h block size:
//   vpx_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt>
//
// Macro arguments:
//   w, h          block width and height in samples
//   wf            width covered by one call to the <wf>xh kernel; the block
//                 is processed as w / wf column strips
//   wlog2, hlog2  shift amounts; only their sum is used, as the right shift
//                 applied to se * se
//   opt           suffix pasted onto the generated and called function names
//   cast          parenthesized cast applied to se before it is squared
//
// Generated functions:
//   src8/ref8/sec8 are converted with CONVERT_TO_SHORTPTR before use. sec is
//   addressed with a row pitch of w. Each function sums the kernel's return
//   value into se and the value the kernel writes through its sse pointer
//   into sse, one kernel call per strip, then stores sse in *sse_ptr and
//   returns sse - ((se * se) >> (wlog2 + hlog2)).
//     8-bit : no rescaling; the subtraction is done in uint32_t.
//     10-bit: se and sse pass through ROUND_POWER_OF_TWO(., 2) and (., 4)
//             first; a negative result is clamped to 0.
//     12-bit: rows are processed in chunks of at most 16 and sse is
//             accumulated in 64 bits; se and sse pass through
//             ROUND_POWER_OF_TWO(., 4) and (., 8); a negative result is
//             clamped to 0.
//
// The strips are walked with a loop stepping by wf instead of hand-unrolled
// calls at fixed offsets 16/32/48, so the macro no longer depends on wf being
// 16 whenever w > wf. For every size instantiated in this file the sequence
// of kernel calls is unchanged.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    uint32_t sse = 0;                                                          \
    int se = 0;                                                                \
    int col;                                                                   \
    /* One kernel call per wf-wide column strip. */                            \
    for (col = 0; col < (w); col += (wf)) {                                    \
      uint32_t strip_sse;                                                      \
      se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                   \
          src + col, src_stride, x_offset, y_offset, ref + col, ref_stride,    \
          sec + col, w, h, &strip_sse, NULL, NULL);                            \
      sse += strip_sse;                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    uint32_t sse = 0;                                                          \
    int se = 0;                                                                \
    int col;                                                                   \
    /* One kernel call per wf-wide column strip. */                            \
    for (col = 0; col < (w); col += (wf)) {                                    \
      uint32_t strip_sse;                                                      \
      se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                   \
          src + col, src_stride, x_offset, y_offset, ref + col, ref_stride,    \
          sec + col, w, h, &strip_sse, NULL, NULL);                            \
      sse += strip_sse;                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint64_t long_sse = 0;                                                     \
    int se = 0;                                                                \
    int start_row, col;                                                        \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    /* Rows go to the kernel in chunks of at most 16; the per-call sse is */   \
    /* folded into a 64-bit total before it is scaled back to 32 bits.    */   \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      for (col = 0; col < (w); col += (wf)) {                                  \
        uint32_t strip_sse;                                                    \
        se += vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + col + (start_row * src_stride), src_stride, x_offset,        \
            y_offset, ref + col + (start_row * ref_stride), ref_stride,        \
            sec + col + (start_row * w), w, height, &strip_sse, NULL, NULL);   \
        long_sse += strip_sse;                                                 \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x4_sse2
545
546
// Block-size table for the avg-variance FN generator: one row per w x h size,
// with arguments (w, h, wf, wlog2, hlog2, opt, cast).
//   wf     is the kernel strip width (16 for w >= 16, otherwise 8)
//   wlog2  is log2(w), hlog2 is log2(h)
// The 8x16 row previously listed (4, 3), i.e. the logs swapped. FN only uses
// wlog2 + hlog2, so generated code is unchanged, but the row now matches the
// block size and the other rows.
#define FNS(opt1)                       \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt1, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt1, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt1, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt1, (int64_t))
558
559
// Emit the 8/10/12-bit avg-variance entry points for every listed block size
// with the sse2 suffix, then retire the generator macros.
FNS(sse2)
560
561
#undef FNS
562
#undef FN
563
564
// Writes the per-sample rounded average of `pred` and `ref` into `comp_pred`
// using _mm_avg_epu16 on 16-bit samples.
//
// `comp_pred` and `pred` are packed with a row pitch of `width`; consecutive
// `ref` rows are `ref_stride` samples apart. Three layouts are handled:
//   width > 8  : one row per pass, 16 samples per inner step
//   width == 8 : two rows per pass, 8 samples each
//   otherwise  : asserted to be width == 4, two rows per pass
// Within each step all inputs are loaded before any output is stored.
void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  int row;

  if (width > 8) {
    for (row = 0; row < height; ++row) {
      int col;
      for (col = 0; col < width; col += 16) {
        const __m128i pred_lo = _mm_loadu_si128((const __m128i *)(pred + col));
        const __m128i pred_hi =
            _mm_loadu_si128((const __m128i *)(pred + col + 8));
        const __m128i ref_lo = _mm_loadu_si128((const __m128i *)(ref + col));
        const __m128i ref_hi =
            _mm_loadu_si128((const __m128i *)(ref + col + 8));
        const __m128i avg_lo = _mm_avg_epu16(pred_lo, ref_lo);
        const __m128i avg_hi = _mm_avg_epu16(pred_hi, ref_hi);
        _mm_storeu_si128((__m128i *)(comp_pred + col), avg_lo);
        _mm_storeu_si128((__m128i *)(comp_pred + col + 8), avg_hi);
      }
      pred += width;
      ref += ref_stride;
      comp_pred += width;
    }
    return;
  }

  if (width == 8) {
    // Two packed 8-sample rows of pred/comp_pred sit back to back.
    for (row = 0; row < height; row += 2) {
      const __m128i pred_top = _mm_loadu_si128((const __m128i *)pred);
      const __m128i pred_bot = _mm_loadu_si128((const __m128i *)(pred + 8));
      const __m128i ref_top = _mm_loadu_si128((const __m128i *)ref);
      const __m128i ref_bot =
          _mm_loadu_si128((const __m128i *)(ref + ref_stride));
      const __m128i avg_top = _mm_avg_epu16(pred_top, ref_top);
      const __m128i avg_bot = _mm_avg_epu16(pred_bot, ref_bot);
      _mm_storeu_si128((__m128i *)comp_pred, avg_top);
      _mm_storeu_si128((__m128i *)(comp_pred + 8), avg_bot);
      pred += 16;
      ref += 2 * ref_stride;
      comp_pred += 16;
    }
    return;
  }

  assert(width == 4);
  // Four samples occupy the low 64 bits of the vector; two rows per pass.
  for (row = 0; row < height; row += 2) {
    const __m128i pred_top = _mm_loadl_epi64((const __m128i *)pred);
    const __m128i pred_bot = _mm_loadl_epi64((const __m128i *)(pred + 4));
    const __m128i ref_top = _mm_loadl_epi64((const __m128i *)ref);
    const __m128i ref_bot =
        _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
    const __m128i avg_top = _mm_avg_epu16(pred_top, ref_top);
    const __m128i avg_bot = _mm_avg_epu16(pred_bot, ref_bot);
    _mm_storel_epi64((__m128i *)comp_pred, avg_top);
    _mm_storel_epi64((__m128i *)(comp_pred + 4), avg_bot);
    pred += 8;
    ref += 2 * ref_stride;
    comp_pred += 8;
  }
}