Coverage Report

Created: 2025-12-31 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/av1/common/x86/intra_edge_sse4.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <smmintrin.h>
14
15
#include "config/aom_config.h"
16
#include "config/av1_rtcd.h"
17
18
1.45M
void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
19
1.45M
  if (!strength) return;
20
21
987k
  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
22
987k
    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
23
987k
    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
24
987k
    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
25
987k
  };
26
27
987k
  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
28
987k
    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
29
987k
    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
30
987k
    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
31
987k
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32
987k
  };
33
34
  // Extend the first and last samples to simplify the loop for the 5-tap case
35
987k
  p[-1] = p[0];
36
987k
  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
37
987k
  _mm_storeu_si128((__m128i *)&p[sz], last);
38
39
  // Adjust input pointer for filter support area
40
987k
  uint8_t *in = (strength == 3) ? p - 1 : p;
41
42
  // Avoid modifying first sample
43
987k
  uint8_t *out = p + 1;
44
987k
  int len = sz - 1;
45
46
987k
  const int use_3tap_filter = (strength < 3);
47
48
987k
  if (use_3tap_filter) {
49
457k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
50
457k
    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
51
457k
    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
52
457k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
53
457k
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
54
1.18M
    while (len > 0) {
55
728k
      int n_out = (len < 8) ? len : 8;
56
728k
      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
57
728k
      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
58
728k
      d0 = _mm_maddubs_epi16(d0, coef0);
59
728k
      d1 = _mm_maddubs_epi16(d1, coef0);
60
728k
      d0 = _mm_hadd_epi16(d0, d1);
61
728k
      __m128i eight = _mm_set1_epi16(8);
62
728k
      d0 = _mm_add_epi16(d0, eight);
63
728k
      d0 = _mm_srai_epi16(d0, 4);
64
728k
      d0 = _mm_packus_epi16(d0, d0);
65
728k
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
66
728k
      __m128i n0 = _mm_set1_epi8(n_out);
67
728k
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
68
728k
      out0 = _mm_blendv_epi8(out0, d0, mask);
69
728k
      _mm_storel_epi64((__m128i *)out, out0);
70
728k
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
71
728k
      in0 = _mm_alignr_epi8(in1, in0, 8);
72
728k
      in += 8;
73
728k
      out += 8;
74
728k
      len -= n_out;
75
728k
    }
76
529k
  } else {  // 5-tap filter
77
529k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
78
529k
    __m128i two = _mm_set1_epi8(2);
79
529k
    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
80
529k
    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
81
529k
    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
82
529k
    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
83
529k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
84
529k
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
85
2.65M
    while (len > 0) {
86
2.12M
      int n_out = (len < 8) ? len : 8;
87
2.12M
      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
88
2.12M
      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
89
2.12M
      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
90
2.12M
      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
91
2.12M
      d0 = _mm_maddubs_epi16(d0, coef0);
92
2.12M
      d1 = _mm_maddubs_epi16(d1, coef0);
93
2.12M
      d2 = _mm_maddubs_epi16(d2, coef0);
94
2.12M
      d3 = _mm_maddubs_epi16(d3, coef0);
95
2.12M
      d0 = _mm_hadd_epi16(d0, d1);
96
2.12M
      d2 = _mm_hadd_epi16(d2, d3);
97
2.12M
      d0 = _mm_hadd_epi16(d0, d2);
98
2.12M
      __m128i eight = _mm_set1_epi16(8);
99
2.12M
      d0 = _mm_add_epi16(d0, eight);
100
2.12M
      d0 = _mm_srai_epi16(d0, 4);
101
2.12M
      d0 = _mm_packus_epi16(d0, d0);
102
2.12M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
103
2.12M
      __m128i n0 = _mm_set1_epi8(n_out);
104
2.12M
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
105
2.12M
      out0 = _mm_blendv_epi8(out0, d0, mask);
106
2.12M
      _mm_storel_epi64((__m128i *)out, out0);
107
2.12M
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
108
2.12M
      in0 = _mm_alignr_epi8(in1, in0, 8);
109
2.12M
      in += 8;
110
2.12M
      out += 8;
111
2.12M
      len -= n_out;
112
2.12M
    }
113
529k
  }
114
987k
}
115
116
366k
void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
117
  // interpolate half-sample positions
118
366k
  assert(sz <= 24);
119
120
366k
  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
121
366k
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
122
366k
  };
123
124
366k
  DECLARE_ALIGNED(
125
366k
      16, static const int8_t,
126
366k
      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
127
366k
                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
128
129
  // Extend first/last samples (upper-left p[-1], last p[sz-1])
130
  // to support 4-tap filter
131
366k
  p[-2] = p[-1];
132
366k
  p[sz] = p[sz - 1];
133
134
366k
  uint8_t *in = &p[-2];
135
366k
  uint8_t *out = &p[-2];
136
137
366k
  int n = sz + 1;  // Input length including upper-left sample
138
139
366k
  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
140
366k
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
141
142
366k
  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
143
366k
  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
144
366k
  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
145
146
780k
  while (n > 0) {
147
413k
    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
148
413k
    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
149
413k
    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
150
413k
    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
151
413k
    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
152
413k
    d0 = _mm_maddubs_epi16(d0, coef0);
153
413k
    d1 = _mm_maddubs_epi16(d1, coef0);
154
413k
    d2 = _mm_maddubs_epi16(d2, coef0);
155
413k
    d3 = _mm_maddubs_epi16(d3, coef0);
156
413k
    d0 = _mm_hadd_epi16(d0, d1);
157
413k
    d2 = _mm_hadd_epi16(d2, d3);
158
413k
    __m128i eight = _mm_set1_epi16(8);
159
413k
    d0 = _mm_add_epi16(d0, eight);
160
413k
    d2 = _mm_add_epi16(d2, eight);
161
413k
    d0 = _mm_srai_epi16(d0, 4);
162
413k
    d2 = _mm_srai_epi16(d2, 4);
163
413k
    d0 = _mm_packus_epi16(d0, d2);
164
413k
    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
165
413k
    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
166
413k
    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
167
413k
    _mm_storeu_si128((__m128i *)&out[0], out0);
168
413k
    _mm_storeu_si128((__m128i *)&out[16], out1);
169
413k
    in0 = in16;
170
413k
    in16 = _mm_setzero_si128();
171
413k
    out += 32;
172
413k
    n -= 16;
173
413k
  }
174
366k
}
175
176
#if CONFIG_AV1_HIGHBITDEPTH
177
178
1.84M
void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
179
1.84M
  if (!strength) return;
180
181
1.32M
  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
182
1.32M
    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
183
1.32M
    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
184
1.32M
    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
185
1.32M
  };
186
187
1.32M
  DECLARE_ALIGNED(16, static const int16_t,
188
1.32M
                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
189
190
  // Extend the first and last samples to simplify the loop for the 5-tap case
191
1.32M
  p[-1] = p[0];
192
1.32M
  __m128i last = _mm_set1_epi16(p[sz - 1]);
193
1.32M
  _mm_storeu_si128((__m128i *)&p[sz], last);
194
195
  // Adjust input pointer for filter support area
196
1.32M
  uint16_t *in = (strength == 3) ? p - 1 : p;
197
198
  // Avoid modifying first sample
199
1.32M
  uint16_t *out = p + 1;
200
1.32M
  int len = sz - 1;
201
202
1.32M
  const int use_3tap_filter = (strength < 3);
203
204
1.32M
  if (use_3tap_filter) {
205
505k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
206
505k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
207
505k
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
208
505k
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
209
1.47M
    while (len > 0) {
210
967k
      int n_out = (len < 8) ? len : 8;
211
967k
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
212
967k
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
213
967k
      __m128i in02 = _mm_add_epi16(in0, in2);
214
967k
      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
215
967k
      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
216
967k
      d0 = _mm_mullo_epi16(d0, coef0);
217
967k
      d1 = _mm_mullo_epi16(d1, coef0);
218
967k
      d0 = _mm_hadd_epi16(d0, d1);
219
967k
      __m128i eight = _mm_set1_epi16(8);
220
967k
      d0 = _mm_add_epi16(d0, eight);
221
967k
      d0 = _mm_srli_epi16(d0, 4);
222
967k
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
223
967k
      __m128i n0 = _mm_set1_epi16(n_out);
224
967k
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
225
967k
      out0 = _mm_blendv_epi8(out0, d0, mask);
226
967k
      _mm_storeu_si128((__m128i *)out, out0);
227
967k
      in += 8;
228
967k
      in0 = in8;
229
967k
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
230
967k
      out += 8;
231
967k
      len -= n_out;
232
967k
    }
233
817k
  } else {  // 5-tap filter
234
817k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
235
817k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
236
817k
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
237
817k
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
238
4.11M
    while (len > 0) {
239
3.29M
      int n_out = (len < 8) ? len : 8;
240
3.29M
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
241
3.29M
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
242
3.29M
      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
243
3.29M
      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
244
3.29M
      __m128i in04 = _mm_add_epi16(in0, in4);
245
3.29M
      __m128i in123 = _mm_add_epi16(in1, in2);
246
3.29M
      in123 = _mm_add_epi16(in123, in3);
247
3.29M
      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
248
3.29M
      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
249
3.29M
      d0 = _mm_mullo_epi16(d0, coef0);
250
3.29M
      d1 = _mm_mullo_epi16(d1, coef0);
251
3.29M
      d0 = _mm_hadd_epi16(d0, d1);
252
3.29M
      __m128i eight = _mm_set1_epi16(8);
253
3.29M
      d0 = _mm_add_epi16(d0, eight);
254
3.29M
      d0 = _mm_srli_epi16(d0, 4);
255
3.29M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
256
3.29M
      __m128i n0 = _mm_set1_epi16(n_out);
257
3.29M
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
258
3.29M
      out0 = _mm_blendv_epi8(out0, d0, mask);
259
3.29M
      _mm_storeu_si128((__m128i *)out, out0);
260
3.29M
      in += 8;
261
3.29M
      in0 = in8;
262
3.29M
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
263
3.29M
      out += 8;
264
3.29M
      len -= n_out;
265
3.29M
    }
266
817k
  }
267
1.32M
}
268
269
371k
void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
270
  // interpolate half-sample positions
271
371k
  assert(sz <= 24);
272
273
371k
  DECLARE_ALIGNED(16, static const int16_t,
274
371k
                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
275
276
  // Extend first/last samples (upper-left p[-1], last p[sz-1])
277
  // to support 4-tap filter
278
371k
  p[-2] = p[-1];
279
371k
  p[sz] = p[sz - 1];
280
281
371k
  uint16_t *in = &p[-2];
282
371k
  uint16_t *out = in;
283
371k
  int n = sz + 1;
284
285
371k
  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
286
371k
  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
287
371k
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
288
371k
  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
289
290
1.12M
  while (n > 0) {
291
756k
    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
292
756k
    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
293
    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
294
756k
    __m128i sum0 = _mm_add_epi16(in0, in3);
295
756k
    __m128i sum1 = _mm_add_epi16(in1, in2);
296
756k
    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
297
756k
    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
298
756k
    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
299
756k
    d0 = _mm_madd_epi16(d0, coef0);
300
756k
    d1 = _mm_madd_epi16(d1, coef0);
301
756k
    __m128i eight = _mm_set1_epi32(8);
302
756k
    d0 = _mm_add_epi32(d0, eight);
303
756k
    d1 = _mm_add_epi32(d1, eight);
304
756k
    d0 = _mm_srai_epi32(d0, 4);
305
756k
    d1 = _mm_srai_epi32(d1, 4);
306
756k
    d0 = _mm_packus_epi32(d0, d1);
307
756k
    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
308
756k
    d0 = _mm_min_epi16(d0, max0);
309
756k
    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
310
756k
    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
311
756k
    _mm_storeu_si128((__m128i *)&out[0], out0);
312
756k
    _mm_storeu_si128((__m128i *)&out[8], out1);
313
756k
    in0 = in8;
314
756k
    in8 = in16;
315
756k
    in16 = in24;
316
756k
    in24 = _mm_setzero_si128();
317
756k
    out += 16;
318
756k
    n -= 8;
319
756k
  }
320
371k
}
321
322
#endif  // CONFIG_AV1_HIGHBITDEPTH