Coverage Report

Created: 2025-07-23 06:32

/src/aom/av1/common/x86/cdef_block_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include "aom_dsp/aom_simd.h"
13
#define SIMD_FUNC(name) name##_avx2
14
#include "av1/common/cdef_block_simd.h"
15
16
/* Folds two vectors of 16-bit partial sums, squares them and applies weights.
   Within each 128-bit lane, partiala has the form
   [x8 - - x1 | x16 - - x9] and partialb has the form
   [0  y1 - y7 | 0 y9 - y15].
   For each lane the returned vector carries the terms of
   (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8,
   spread over the lane's four 32-bit elements (two terms per element), where
   the weights C1..C8 come from const1 and const2.
   Side effects: on return *partiala equals the returned vector and *partialb
   holds the const2-weighted products. */
static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala,
                                            __m256i *partialb,
                                            const __m256i *const1,
                                            const __m256i *const2) {
  // Per-lane byte shuffle control: mirrors 16-bit words 0..6 and leaves word 7
  // where it is.
  const __m256i reverse_ctrl =
      _mm256_setr_epi32(0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100,
                        0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100);

  /* Reverse partial B so that equal indices of x and y line up. */
  const __m256i y_rev = _mm256_shuffle_epi8(*partialb, reverse_ctrl);
  const __m256i x_in = *partiala;

  /* Interleave x and y of identical indices; x8 ends up paired with 0. */
  const __m256i xy_lo = _mm256_unpacklo_epi16(x_in, y_rev);
  const __m256i xy_hi = _mm256_unpackhi_epi16(x_in, y_rev);

  /* madd of a vector with itself yields x^2 + y^2 per 32-bit element. */
  const __m256i sq_lo = _mm256_madd_epi16(xy_lo, xy_lo);
  const __m256i sq_hi = _mm256_madd_epi16(xy_hi, xy_hi);

  /* Apply the weights, then add both halves together. */
  const __m256i weighted_lo = _mm256_mullo_epi32(sq_lo, *const1);
  const __m256i weighted_hi = _mm256_mullo_epi32(sq_hi, *const2);
  const __m256i total = _mm256_add_epi32(weighted_lo, weighted_hi);

  *partialb = weighted_hi;
  *partiala = total;
  return total;
}
50
51
/* Horizontal sum of four vectors at once, independently per 128-bit lane.
   Element i of each lane of the result is the sum of the four 32-bit elements
   in the same lane of the i-th argument (x0 -> element 0, ..., x3 ->
   element 3).
   Side effect: the arguments are overwritten with the 4x4 transpose of the
   inputs (per lane), i.e. *xk receives element k of x0, x1, x2 and x3. */
static inline __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
                                 __m256i *x3) {
  /* 32-bit interleave of (x0, x1) and of (x2, x3). */
  const __m256i x01_lo = _mm256_unpacklo_epi32(*x0, *x1);
  const __m256i x01_hi = _mm256_unpackhi_epi32(*x0, *x1);
  const __m256i x23_lo = _mm256_unpacklo_epi32(*x2, *x3);
  const __m256i x23_hi = _mm256_unpackhi_epi32(*x2, *x3);

  /* 64-bit interleave completes the transpose. */
  const __m256i col0 = _mm256_unpacklo_epi64(x01_lo, x23_lo);
  const __m256i col1 = _mm256_unpackhi_epi64(x01_lo, x23_lo);
  const __m256i col2 = _mm256_unpacklo_epi64(x01_hi, x23_hi);
  const __m256i col3 = _mm256_unpackhi_epi64(x01_hi, x23_hi);

  *x0 = col0;
  *x1 = col1;
  *x2 = col2;
  *x3 = col3;

  /* Adding the transposed columns gives one row sum per element. */
  const __m256i sum01 = _mm256_add_epi32(col0, col1);
  const __m256i sum23 = _mm256_add_epi32(col2, col3);
  return _mm256_add_epi32(sum01, sum23);
}
65
66
/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
67
to compute the remaining directions. */
68
static inline __m256i compute_directions_avx2(__m256i *lines,
69
                                              int32_t cost_frist_8x8[4],
70
20.1M
                                              int32_t cost_second_8x8[4]) {
71
20.1M
  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
72
20.1M
  __m256i partial6;
73
20.1M
  __m256i tmp;
74
  /* Partial sums for lines 0 and 1. */
75
20.1M
  partial4a = _mm256_slli_si256(lines[0], 14);
76
20.1M
  partial4b = _mm256_srli_si256(lines[0], 2);
77
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
78
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
79
20.1M
  tmp = _mm256_add_epi16(lines[0], lines[1]);
80
20.1M
  partial5a = _mm256_slli_si256(tmp, 10);
81
20.1M
  partial5b = _mm256_srli_si256(tmp, 6);
82
20.1M
  partial7a = _mm256_slli_si256(tmp, 4);
83
20.1M
  partial7b = _mm256_srli_si256(tmp, 12);
84
20.1M
  partial6 = tmp;
85
86
  /* Partial sums for lines 2 and 3. */
87
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
88
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
89
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
90
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
91
20.1M
  tmp = _mm256_add_epi16(lines[2], lines[3]);
92
20.1M
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
93
20.1M
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
94
20.1M
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
95
20.1M
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
96
20.1M
  partial6 = _mm256_add_epi16(partial6, tmp);
97
98
  /* Partial sums for lines 4 and 5. */
99
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
100
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
101
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
102
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
103
20.1M
  tmp = _mm256_add_epi16(lines[4], lines[5]);
104
20.1M
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
105
20.1M
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
106
20.1M
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
107
20.1M
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
108
20.1M
  partial6 = _mm256_add_epi16(partial6, tmp);
109
110
  /* Partial sums for lines 6 and 7. */
111
20.1M
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
112
20.1M
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
113
20.1M
  partial4a = _mm256_add_epi16(partial4a, lines[7]);
114
20.1M
  tmp = _mm256_add_epi16(lines[6], lines[7]);
115
20.1M
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
116
20.1M
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
117
20.1M
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
118
20.1M
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
119
20.1M
  partial6 = _mm256_add_epi16(partial6, tmp);
120
121
20.1M
  const __m256i const_reg_1 =
122
20.1M
      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
123
20.1M
  const __m256i const_reg_2 =
124
20.1M
      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
125
20.1M
  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
126
20.1M
  const __m256i const_reg_4 =
127
20.1M
      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
128
129
  /* Compute costs in terms of partial sums. */
130
20.1M
  partial4a =
131
20.1M
      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
132
20.1M
  partial7a =
133
20.1M
      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
134
20.1M
  partial5a =
135
20.1M
      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
136
20.1M
  partial6 = _mm256_madd_epi16(partial6, partial6);
137
20.1M
  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
138
139
20.1M
  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
140
20.1M
  _mm_storeu_si128((__m128i *)cost_frist_8x8,
141
20.1M
                   _mm256_castsi256_si128(partial4a));
142
20.1M
  _mm_storeu_si128((__m128i *)cost_second_8x8,
143
20.1M
                   _mm256_extractf128_si256(partial4a, 1));
144
145
20.1M
  return partial4a;
146
20.1M
}
147
148
/* Transposes an 8x8 block of 16-bit elements and reverses the order of the
   resulting lines, which equals a 90-degree counter-clockwise rotation of the
   pixels: res[k] receives column (7 - k) of the input. Each 128-bit lane of
   in[0..7] is treated as a separate 8x8 block.
   All input rows are read before the first output row is written, so the
   function may be called with res == in. */
static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  /* Step 1: interleave 16-bit elements of neighbouring row pairs. */
  const __m256i r01_lo = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i r01_hi = _mm256_unpackhi_epi16(in[0], in[1]);
  const __m256i r23_lo = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i r23_hi = _mm256_unpackhi_epi16(in[2], in[3]);
  const __m256i r45_lo = _mm256_unpacklo_epi16(in[4], in[5]);
  const __m256i r45_hi = _mm256_unpackhi_epi16(in[4], in[5]);
  const __m256i r67_lo = _mm256_unpacklo_epi16(in[6], in[7]);
  const __m256i r67_hi = _mm256_unpackhi_epi16(in[6], in[7]);

  /* Step 2: interleave 32-bit groups; each vector now holds two columns of
     four consecutive rows (rows 0-3 or rows 4-7). */
  const __m256i top_c01 = _mm256_unpacklo_epi32(r01_lo, r23_lo);
  const __m256i top_c23 = _mm256_unpackhi_epi32(r01_lo, r23_lo);
  const __m256i top_c45 = _mm256_unpacklo_epi32(r01_hi, r23_hi);
  const __m256i top_c67 = _mm256_unpackhi_epi32(r01_hi, r23_hi);
  const __m256i bot_c01 = _mm256_unpacklo_epi32(r45_lo, r67_lo);
  const __m256i bot_c23 = _mm256_unpackhi_epi32(r45_lo, r67_lo);
  const __m256i bot_c45 = _mm256_unpacklo_epi32(r45_hi, r67_hi);
  const __m256i bot_c67 = _mm256_unpackhi_epi32(r45_hi, r67_hi);

  /* Step 3: join the top and bottom halves of every column and store the
     columns in reverse order. */
  res[0] = _mm256_unpackhi_epi64(top_c67, bot_c67);
  res[1] = _mm256_unpacklo_epi64(top_c67, bot_c67);
  res[2] = _mm256_unpackhi_epi64(top_c45, bot_c45);
  res[3] = _mm256_unpacklo_epi64(top_c45, bot_c45);
  res[4] = _mm256_unpackhi_epi64(top_c23, bot_c23);
  res[5] = _mm256_unpacklo_epi64(top_c23, bot_c23);
  res[6] = _mm256_unpackhi_epi64(top_c01, bot_c01);
  res[7] = _mm256_unpacklo_epi64(top_c01, bot_c01);
}
178
179
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
180
                             int stride, int32_t *var_out_1st,
181
                             int32_t *var_out_2nd, int coeff_shift,
182
10.0M
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
183
10.0M
  int32_t cost_first_8x8[8];
184
10.0M
  int32_t cost_second_8x8[8];
185
  // Used to store the best cost for 2 8x8's.
186
10.0M
  int32_t best_cost[2] = { 0 };
187
  // Best direction for 2 8x8's.
188
10.0M
  int best_dir[2] = { 0 };
189
190
10.0M
  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
191
10.0M
  const __m256i const_128_reg = _mm256_set1_epi16(128);
192
10.0M
  __m256i lines[8];
193
90.5M
  for (int i = 0; i < 8; i++) {
194
80.4M
    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
195
80.4M
    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
196
197
80.4M
    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
198
80.4M
    lines[i] = _mm256_sub_epi16(
199
80.4M
        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
200
80.4M
  }
201
202
  /* Compute "mostly vertical" directions. */
203
10.0M
  const __m256i dir47 =
204
10.0M
      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
205
206
  /* Transpose and reverse the order of the lines. */
207
10.0M
  array_reverse_transpose_8x8_avx2(lines, lines);
208
209
  /* Compute "mostly horizontal" directions. */
210
10.0M
  const __m256i dir03 =
211
10.0M
      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
212
213
10.0M
  __m256i max = _mm256_max_epi32(dir03, dir47);
214
10.0M
  max =
215
10.0M
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
216
10.0M
                                            _mm256_slli_si256(max, 16 - (8))));
217
10.0M
  max =
218
10.0M
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
219
10.0M
                                            _mm256_slli_si256(max, 16 - (4))));
220
221
10.0M
  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
222
10.0M
  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
223
10.0M
  const __m128i cmpeg_res_00 =
224
10.0M
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
225
10.0M
  const __m128i cmpeg_res_01 =
226
10.0M
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
227
10.0M
  const __m128i cmpeg_res_10 =
228
10.0M
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
229
10.0M
  const __m128i cmpeg_res_11 =
230
10.0M
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
231
10.0M
  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
232
10.0M
  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
233
234
10.0M
  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
235
10.0M
  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
236
10.0M
  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
237
10.0M
  best_dir[0] =
238
10.0M
      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
239
10.0M
  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
240
10.0M
  best_dir[1] =
241
10.0M
      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
242
243
  /* Difference between the optimal variance and the variance along the
244
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
245
10.0M
  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
246
10.0M
  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
247
248
  /* We'd normally divide by 840, but dividing by 1024 is close enough
249
  for what we're going to do with this. */
250
10.0M
  *var_out_1st >>= 10;
251
10.0M
  *var_out_2nd >>= 10;
252
10.0M
  *out_dir_1st_8x8 = best_dir[0];
253
10.0M
  *out_dir_2nd_8x8 = best_dir[1];
254
10.0M
}
255
256
/* Loads four consecutive bytes from an arbitrarily aligned address into the
   low 32 bits of an XMM register (p[0] in the lowest byte, upper 96 bits
   zero). The value is assembled byte by byte so that no misaligned,
   type-punned int32_t pointer is ever dereferenced. */
static inline __m128i cdef_load_4x8bit_avx2(const uint8_t *p) {
  const uint32_t packed = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)packed);
}

/* Copies a width x height rectangle of 8-bit pixels from src to the 16-bit
   buffer dst, zero-extending every pixel.
   dst, dstride: destination and its row pitch, in uint16_t elements.
   src, sstride: source and its row pitch, in bytes.
   width:        pixels per row, must be > 0.
   height:       number of rows, must be > 0 and even (rows are handled in
                 pairs).
   The columns are covered by progressively narrower passes of 32, 16, 8 and 4
   pixels, followed by a scalar loop for what is left; `j` tracks the first
   column not yet copied. */
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process multiple 32 pixels at a time.
  if (remaining_width > 31) {
    int i = 0;
    do {
      const uint8_t *const src0 = src + (i + 0) * sstride;
      const uint8_t *const src1 = src + (i + 1) * sstride;
      uint16_t *const dst0 = dst + (i + 0) * dstride;
      uint16_t *const dst1 = dst + (i + 1) * dstride;
      j = 0;
      do {
        const __m128i row00 = _mm_loadu_si128((const __m128i *)(src0 + j));
        const __m128i row01 = _mm_loadu_si128((const __m128i *)(src0 + j + 16));
        const __m128i row10 = _mm_loadu_si128((const __m128i *)(src1 + j));
        const __m128i row11 = _mm_loadu_si128((const __m128i *)(src1 + j + 16));
        _mm256_storeu_si256((__m256i *)(dst0 + j), _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)(dst0 + j + 16),
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)(dst1 + j), _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)(dst1 + j + 16),
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    // j now equals width rounded down to a multiple of 32.
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      const __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      const __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    const __m128i zero = _mm_setzero_si128();
    int i = 0;
    do {
      const __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      const __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, zero));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, zero));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    const __m128i zero = _mm_setzero_si128();
    int i = 0;
    do {
      // src + j has no alignment guarantee, so use the byte-wise loader
      // instead of reading through an int32_t pointer.
      const __m128i row0 = cdef_load_4x8bit_avx2(&src[(i + 0) * sstride + j]);
      const __m128i row1 = cdef_load_4x8bit_avx2(&src[(i + 1) * sstride + j]);
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, zero));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, zero));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}