Coverage Report

Created: 2026-02-14 06:59

/src/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
Every instrumented line in this file reports an execution count of 0 (the file is entirely uncovered). The source is reproduced below, with the coverage tool's "Unexecuted instantiation" annotations kept where they were emitted.

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

#if HAVE_X86_ASM
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride, int w,
                                   int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}

static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}

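The 10- and 12-bit helpers above accumulate into wider temporaries and then scale the sum and SSE back toward the 8-bit-equivalent range (by 2^2/2^4 and 2^4/2^8 respectively). As a standalone illustration of that normalization, the sketch below uses a local macro assumed to behave like libvpx's ROUND_POWER_OF_TWO; it is not part of the instrumented file.

/* --- illustrative sketch, not part of highbd_variance_sse2.c --- */
#include <stdint.h>
#include <stdio.h>

/* Local stand-in, assumed to match libvpx's ROUND_POWER_OF_TWO:
 * divide by 2^n, rounding to nearest (ties round up). */
#define LOCAL_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  /* Pretend these came out of the block-wise accumulation loops above. */
  uint64_t sse_long = 1000005;
  int32_t sum_long = 4094;

  /* 10-bit path: sum scaled down by 2^2, SSE (a squared quantity) by 2^4. */
  printf("10-bit: sum=%d sse=%u\n",
         (int)LOCAL_ROUND_POWER_OF_TWO(sum_long, 2),
         (uint32_t)LOCAL_ROUND_POWER_OF_TWO(sse_long, 4));

  /* 12-bit path: sum scaled down by 2^4, SSE by 2^8. */
  printf("12-bit: sum=%d sse=%u\n",
         (int)LOCAL_ROUND_POWER_OF_TWO(sum_long, 4),
         (uint32_t)LOCAL_ROUND_POWER_OF_TWO(sse_long, 8));
  return 0;
}
/* --- end of sketch --- */
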
#define HIGH_GET_VAR(S)                                                       \
  void vpx_highbd_8_get##S##x##S##var_sse2(                                   \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
  }                                                                           \
Unexecuted instantiation: vpx_highbd_8_get16x16var_sse2
Unexecuted instantiation: vpx_highbd_8_get8x8var_sse2
                                                                              \
  void vpx_highbd_10_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
  }                                                                           \
Unexecuted instantiation: vpx_highbd_10_get16x16var_sse2
Unexecuted instantiation: vpx_highbd_10_get8x8var_sse2
                                                                              \
  void vpx_highbd_12_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
  }
Unexecuted instantiation: vpx_highbd_12_get16x16var_sse2
Unexecuted instantiation: vpx_highbd_12_get8x8var_sse2

HIGH_GET_VAR(16)
HIGH_GET_VAR(8)

#undef HIGH_GET_VAR

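The vpx_highbd_calc8x8var_sse2 / vpx_highbd_calc16x16var_sse2 kernels used above are only declared in this file and are defined elsewhere; their return value is ignored by these wrappers. Based on how they are called, a plain-C reference of the quantity they compute might look like the sketch below (an illustration under that assumption, not the project's implementation).

/* --- illustrative sketch, not part of highbd_variance_sse2.c --- */
#include <stdint.h>

/* Accumulate the signed sum of differences and the sum of squared
 * differences over a size-by-size block of 16-bit samples. */
uint32_t highbd_calc_var_ref(const uint16_t *src, int src_stride,
                             const uint16_t *ref, int ref_stride, int size,
                             uint32_t *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      *sum += diff;
      *sse += (uint32_t)((int64_t)diff * diff); /* widen before squaring */
    }
  }
  return *sse; /* mirrors *sse; the callers above discard the return value */
}
/* --- end of sketch --- */
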
#define VAR_FN(w, h, block_size, shift)                                    \
  uint32_t vpx_highbd_8_variance##w##x##h##_sse2(                          \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_8_variance_sse2(                                                \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift));             \
  }                                                                        \
Unexecuted instantiation: vpx_highbd_8_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_8_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_8_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_8_variance8x8_sse2
                                                                           \
  uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_10_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
Unexecuted instantiation: vpx_highbd_10_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_10_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_10_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_10_variance8x8_sse2
                                                                           \
  uint32_t vpx_highbd_12_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_12_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
Unexecuted instantiation: vpx_highbd_12_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_12_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_12_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_12_variance8x8_sse2

VAR_FN(64, 64, 16, 12)
VAR_FN(64, 32, 16, 11)
VAR_FN(32, 64, 16, 11)
VAR_FN(32, 32, 16, 10)
VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)

#undef VAR_FN

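In each VAR_FN instantiation above, the shift argument is log2(w * h), so the returned value is SSE minus the squared-mean term, i.e. variance = SSE - sum^2 / (w * h), computed with a shift instead of a division. A minimal standalone check of that relationship for the 16x16 case (illustrative numbers, not taken from the report):

/* --- illustrative sketch, not part of highbd_variance_sse2.c --- */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* VAR_FN(16, 16, 16, 8): shift = 8 = log2(16 * 16). */
  const int w = 16, h = 16, shift = 8;
  const uint32_t sse = 51200; /* example accumulated SSE */
  const int64_t sum = 960;    /* example accumulated signed sum */
  const uint32_t variance = sse - (uint32_t)((sum * sum) >> shift);
  printf("w*h=%d  sum^2/(w*h)=%lld  variance=%u\n", w * h,
         (long long)((sum * sum) >> shift), variance);
  return 0;
}
/* --- end of sketch --- */
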
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                         vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                         vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                          vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                          vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *ref, ptrdiff_t ref_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt)     \
  DECL(16, opt)

DECLS(sse2)

#undef DECLS
#undef DECL

#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL,     \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, ref + 32 + (start_row * ref_stride), ref_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, ref + 48 + (start_row * ref_stride), ref_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define FNS(opt)                       \
  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt, (int64_t))

FNS(sse2)
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_variance8x4_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_variance8x4_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_variance8x4_sse2

#undef FNS
#undef FN

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second,     \
      ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \
      void *unused);
#define DECLS(opt1) \
  DECL(16, opt1)    \
  DECL(8, opt1)

DECLS(sse2)
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int start_row;                                                             \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          ref + (start_row * ref_stride), ref_stride, sec + (start_row * w),   \
          w, height, &sse2, NULL, NULL);                                       \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, ref + 16 + (start_row * ref_stride), ref_stride,         \
            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, ref + 32 + (start_row * ref_stride), ref_stride,       \
              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, ref + 48 + (start_row * ref_stride), ref_stride,       \
              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define FNS(opt1)                       \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt1, (int64_t))  \
  FN(8, 16, 8, 4, 3, opt1, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt1, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt1, (int64_t))

FNS(sse2)
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance64x64_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance64x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x64_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance32x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance16x8_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x16_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x8_sse2
Unexecuted instantiation: vpx_highbd_8_sub_pixel_avg_variance8x4_sse2
Unexecuted instantiation: vpx_highbd_10_sub_pixel_avg_variance8x4_sse2
Unexecuted instantiation: vpx_highbd_12_sub_pixel_avg_variance8x4_sse2

#undef FNS
#undef FN
#endif  // HAVE_X86_ASM

void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  int i, j;
  if (width > 8) {
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 16) {
        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else if (width == 8) {
    for (i = 0; i < height; i += 2) {
      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
      comp_pred += 8 << 1;
      pred += 8 << 1;
      ref += ref_stride << 1;
    }
  } else {
    assert(width == 4);
    for (i = 0; i < height; i += 2) {
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
      comp_pred += 4 << 1;
      pred += 4 << 1;
      ref += ref_stride << 1;
    }
  }
}
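
_mm_avg_epu16 computes an unsigned average with rounding up, so element-wise the function above produces comp_pred[x] = (pred[x] + ref[x] + 1) >> 1. The scalar sketch below states that behaviour in plain C and can serve as a spot-check for the SIMD path; it is illustrative only and not part of the library.

/* --- illustrative sketch, not part of highbd_variance_sse2.c --- */
#include <stdint.h>

/* Scalar reference for the rounded average computed by
 * vpx_highbd_comp_avg_pred_sse2 above. */
void highbd_comp_avg_pred_ref(uint16_t *comp_pred, const uint16_t *pred,
                              int width, int height, const uint16_t *ref,
                              int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width; /* pred/comp_pred are packed: stride == width */
    pred += width;
    ref += ref_stride;  /* ref keeps its own stride, as in the SIMD version */
  }
}
/* --- end of sketch --- */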