Coverage Report

Created: 2023-06-07 06:31

/src/aom/av1/common/x86/intra_edge_sse4.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#include <assert.h>
13
#include <smmintrin.h>
14
15
#include "config/aom_config.h"
16
#include "config/av1_rtcd.h"
17
18
3.20M
void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
19
3.20M
  if (!strength) return;
20
21
2.16M
  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
22
2.16M
    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
23
2.16M
    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
24
2.16M
    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
25
2.16M
  };
26
27
2.16M
  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
28
2.16M
    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
29
2.16M
    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
30
2.16M
    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
31
2.16M
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32
2.16M
  };
33
34
  // Extend the first and last samples to simplify the loop for the 5-tap case
35
2.16M
  p[-1] = p[0];
36
2.16M
  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
37
2.16M
  _mm_storeu_si128((__m128i *)&p[sz], last);
38
39
  // Adjust input pointer for filter support area
40
2.16M
  uint8_t *in = (strength == 3) ? p - 1 : p;
41
42
  // Avoid modifying first sample
43
2.16M
  uint8_t *out = p + 1;
44
2.16M
  int len = sz - 1;
45
46
2.16M
  const int use_3tap_filter = (strength < 3);
47
48
2.16M
  if (use_3tap_filter) {
49
897k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
50
897k
    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
51
897k
    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
52
897k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
53
897k
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
54
2.52M
    while (len > 0) {
55
1.62M
      int n_out = (len < 8) ? len : 8;
56
1.62M
      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
57
1.62M
      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
58
1.62M
      d0 = _mm_maddubs_epi16(d0, coef0);
59
1.62M
      d1 = _mm_maddubs_epi16(d1, coef0);
60
1.62M
      d0 = _mm_hadd_epi16(d0, d1);
61
1.62M
      __m128i eight = _mm_set1_epi16(8);
62
1.62M
      d0 = _mm_add_epi16(d0, eight);
63
1.62M
      d0 = _mm_srai_epi16(d0, 4);
64
1.62M
      d0 = _mm_packus_epi16(d0, d0);
65
1.62M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
66
1.62M
      __m128i n0 = _mm_set1_epi8(n_out);
67
1.62M
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
68
1.62M
      out0 = _mm_blendv_epi8(out0, d0, mask);
69
1.62M
      _mm_storel_epi64((__m128i *)out, out0);
70
1.62M
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
71
1.62M
      in0 = _mm_alignr_epi8(in1, in0, 8);
72
1.62M
      in += 8;
73
1.62M
      out += 8;
74
1.62M
      len -= n_out;
75
1.62M
    }
76
1.26M
  } else {  // 5-tap filter
77
1.26M
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
78
1.26M
    __m128i two = _mm_set1_epi8(2);
79
1.26M
    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
80
1.26M
    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
81
1.26M
    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
82
1.26M
    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
83
1.26M
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
84
1.26M
    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
85
6.43M
    while (len > 0) {
86
5.16M
      int n_out = (len < 8) ? len : 8;
87
5.16M
      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
88
5.16M
      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
89
5.16M
      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
90
5.16M
      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
91
5.16M
      d0 = _mm_maddubs_epi16(d0, coef0);
92
5.16M
      d1 = _mm_maddubs_epi16(d1, coef0);
93
5.16M
      d2 = _mm_maddubs_epi16(d2, coef0);
94
5.16M
      d3 = _mm_maddubs_epi16(d3, coef0);
95
5.16M
      d0 = _mm_hadd_epi16(d0, d1);
96
5.16M
      d2 = _mm_hadd_epi16(d2, d3);
97
5.16M
      d0 = _mm_hadd_epi16(d0, d2);
98
5.16M
      __m128i eight = _mm_set1_epi16(8);
99
5.16M
      d0 = _mm_add_epi16(d0, eight);
100
5.16M
      d0 = _mm_srai_epi16(d0, 4);
101
5.16M
      d0 = _mm_packus_epi16(d0, d0);
102
5.16M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
103
5.16M
      __m128i n0 = _mm_set1_epi8(n_out);
104
5.16M
      __m128i mask = _mm_cmpgt_epi8(n0, iden);
105
5.16M
      out0 = _mm_blendv_epi8(out0, d0, mask);
106
5.16M
      _mm_storel_epi64((__m128i *)out, out0);
107
5.16M
      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
108
5.16M
      in0 = _mm_alignr_epi8(in1, in0, 8);
109
5.16M
      in += 8;
110
5.16M
      out += 8;
111
5.16M
      len -= n_out;
112
5.16M
    }
113
1.26M
  }
114
2.16M
}
115
116
2.62M
void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
117
2.62M
  if (!strength) return;
118
119
1.79M
  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
120
1.79M
    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
121
1.79M
    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
122
1.79M
    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
123
1.79M
  };
124
125
1.79M
  DECLARE_ALIGNED(16, static const int16_t,
126
1.79M
                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
127
128
  // Extend the first and last samples to simplify the loop for the 5-tap case
129
1.79M
  p[-1] = p[0];
130
1.79M
  __m128i last = _mm_set1_epi16(p[sz - 1]);
131
1.79M
  _mm_storeu_si128((__m128i *)&p[sz], last);
132
133
  // Adjust input pointer for filter support area
134
1.79M
  uint16_t *in = (strength == 3) ? p - 1 : p;
135
136
  // Avoid modifying first sample
137
1.79M
  uint16_t *out = p + 1;
138
1.79M
  int len = sz - 1;
139
140
1.79M
  const int use_3tap_filter = (strength < 3);
141
142
1.79M
  if (use_3tap_filter) {
143
677k
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
144
677k
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
145
677k
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
146
677k
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
147
1.87M
    while (len > 0) {
148
1.19M
      int n_out = (len < 8) ? len : 8;
149
1.19M
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
150
1.19M
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
151
1.19M
      __m128i in02 = _mm_add_epi16(in0, in2);
152
1.19M
      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
153
1.19M
      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
154
1.19M
      d0 = _mm_mullo_epi16(d0, coef0);
155
1.19M
      d1 = _mm_mullo_epi16(d1, coef0);
156
1.19M
      d0 = _mm_hadd_epi16(d0, d1);
157
1.19M
      __m128i eight = _mm_set1_epi16(8);
158
1.19M
      d0 = _mm_add_epi16(d0, eight);
159
1.19M
      d0 = _mm_srli_epi16(d0, 4);
160
1.19M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
161
1.19M
      __m128i n0 = _mm_set1_epi16(n_out);
162
1.19M
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
163
1.19M
      out0 = _mm_blendv_epi8(out0, d0, mask);
164
1.19M
      _mm_storeu_si128((__m128i *)out, out0);
165
1.19M
      in += 8;
166
1.19M
      in0 = in8;
167
1.19M
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
168
1.19M
      out += 8;
169
1.19M
      len -= n_out;
170
1.19M
    }
171
1.12M
  } else {  // 5-tap filter
172
1.12M
    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
173
1.12M
    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
174
1.12M
    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
175
1.12M
    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
176
5.70M
    while (len > 0) {
177
4.58M
      int n_out = (len < 8) ? len : 8;
178
4.58M
      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
179
4.58M
      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
180
4.58M
      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
181
4.58M
      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
182
4.58M
      __m128i in04 = _mm_add_epi16(in0, in4);
183
4.58M
      __m128i in123 = _mm_add_epi16(in1, in2);
184
4.58M
      in123 = _mm_add_epi16(in123, in3);
185
4.58M
      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
186
4.58M
      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
187
4.58M
      d0 = _mm_mullo_epi16(d0, coef0);
188
4.58M
      d1 = _mm_mullo_epi16(d1, coef0);
189
4.58M
      d0 = _mm_hadd_epi16(d0, d1);
190
4.58M
      __m128i eight = _mm_set1_epi16(8);
191
4.58M
      d0 = _mm_add_epi16(d0, eight);
192
4.58M
      d0 = _mm_srli_epi16(d0, 4);
193
4.58M
      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
194
4.58M
      __m128i n0 = _mm_set1_epi16(n_out);
195
4.58M
      __m128i mask = _mm_cmpgt_epi16(n0, iden);
196
4.58M
      out0 = _mm_blendv_epi8(out0, d0, mask);
197
4.58M
      _mm_storeu_si128((__m128i *)out, out0);
198
4.58M
      in += 8;
199
4.58M
      in0 = in8;
200
4.58M
      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
201
4.58M
      out += 8;
202
4.58M
      len -= n_out;
203
4.58M
    }
204
1.12M
  }
205
1.79M
}
206
207
760k
void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
208
  // interpolate half-sample positions
209
760k
  assert(sz <= 24);
210
211
760k
  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
212
760k
    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
213
760k
  };
214
215
760k
  DECLARE_ALIGNED(
216
760k
      16, static const int8_t,
217
760k
      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
218
760k
                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
219
220
  // Extend first/last samples (upper-left p[-1], last p[sz-1])
221
  // to support 4-tap filter
222
760k
  p[-2] = p[-1];
223
760k
  p[sz] = p[sz - 1];
224
225
760k
  uint8_t *in = &p[-2];
226
760k
  uint8_t *out = &p[-2];
227
228
760k
  int n = sz + 1;  // Input length including upper-left sample
229
230
760k
  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
231
760k
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
232
233
760k
  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
234
760k
  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
235
760k
  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
236
237
1.65M
  while (n > 0) {
238
895k
    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
239
895k
    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
240
895k
    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
241
895k
    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
242
895k
    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
243
895k
    d0 = _mm_maddubs_epi16(d0, coef0);
244
895k
    d1 = _mm_maddubs_epi16(d1, coef0);
245
895k
    d2 = _mm_maddubs_epi16(d2, coef0);
246
895k
    d3 = _mm_maddubs_epi16(d3, coef0);
247
895k
    d0 = _mm_hadd_epi16(d0, d1);
248
895k
    d2 = _mm_hadd_epi16(d2, d3);
249
895k
    __m128i eight = _mm_set1_epi16(8);
250
895k
    d0 = _mm_add_epi16(d0, eight);
251
895k
    d2 = _mm_add_epi16(d2, eight);
252
895k
    d0 = _mm_srai_epi16(d0, 4);
253
895k
    d2 = _mm_srai_epi16(d2, 4);
254
895k
    d0 = _mm_packus_epi16(d0, d2);
255
895k
    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
256
895k
    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
257
895k
    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
258
895k
    _mm_storeu_si128((__m128i *)&out[0], out0);
259
895k
    _mm_storeu_si128((__m128i *)&out[16], out1);
260
895k
    in0 = in16;
261
895k
    in16 = _mm_setzero_si128();
262
895k
    out += 32;
263
895k
    n -= 16;
264
895k
  }
265
760k
}
266
267
591k
void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
268
  // interpolate half-sample positions
269
591k
  assert(sz <= 24);
270
271
591k
  DECLARE_ALIGNED(16, static const int16_t,
272
591k
                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
273
274
  // Extend first/last samples (upper-left p[-1], last p[sz-1])
275
  // to support 4-tap filter
276
591k
  p[-2] = p[-1];
277
591k
  p[sz] = p[sz - 1];
278
279
591k
  uint16_t *in = &p[-2];
280
591k
  uint16_t *out = in;
281
591k
  int n = sz + 1;
282
283
591k
  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
284
591k
  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
285
591k
  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
286
591k
  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
287
288
1.68M
  while (n > 0) {
289
1.09M
    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
290
1.09M
    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
291
1.09M
    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
292
1.09M
    __m128i sum0 = _mm_add_epi16(in0, in3);
293
1.09M
    __m128i sum1 = _mm_add_epi16(in1, in2);
294
1.09M
    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
295
1.09M
    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
296
1.09M
    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
297
1.09M
    d0 = _mm_madd_epi16(d0, coef0);
298
1.09M
    d1 = _mm_madd_epi16(d1, coef0);
299
1.09M
    __m128i eight = _mm_set1_epi32(8);
300
1.09M
    d0 = _mm_add_epi32(d0, eight);
301
1.09M
    d1 = _mm_add_epi32(d1, eight);
302
1.09M
    d0 = _mm_srai_epi32(d0, 4);
303
1.09M
    d1 = _mm_srai_epi32(d1, 4);
304
1.09M
    d0 = _mm_packus_epi32(d0, d1);
305
1.09M
    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
306
1.09M
    d0 = _mm_min_epi16(d0, max0);
307
1.09M
    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
308
1.09M
    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
309
1.09M
    _mm_storeu_si128((__m128i *)&out[0], out0);
310
1.09M
    _mm_storeu_si128((__m128i *)&out[8], out1);
311
1.09M
    in0 = in8;
312
1.09M
    in8 = in16;
313
1.09M
    in16 = in24;
314
1.09M
    in24 = _mm_setzero_si128();
315
1.09M
    out += 16;
316
1.09M
    n -= 8;
317
1.09M
  }
318
591k
}