Coverage Report

Created: 2023-06-07 06:31

/src/aom/aom_dsp/x86/highbd_convolve_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
#include <emmintrin.h>
12
13
#include "config/aom_dsp_rtcd.h"
14
#include "aom_dsp/x86/convolve.h"
15
16
// -----------------------------------------------------------------------------
17
18
// Vertical 4-tap convolution of a 4-pixel-wide high-bitdepth column (SSE2).
//
// All eight coefficients are loaded from `filter`, but only filter[2..5]
// take part in the sum. With S[k] denoting the source row at
// src_ptr + k * src_pitch, output row r is
//   (f2*S[r+2] + f3*S[r+3] + f4*S[r+4] + f5*S[r+5] + 64) >> 7
// saturated to int16 and then clamped to [0, (1 << bd) - 1].
// Rows are produced in pairs, so an odd `height` leaves its last row
// unwritten. Each output row is written as 4 pixels (8 bytes).
// NOTE(review): the clamp uses signed 16-bit compares, so this presumably
// expects bd <= 15 -- confirm against callers.
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // taps 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // taps 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);
  // taps 2 3 2 3 2 3 2 3
  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);
  // taps 4 5 4 5 4 5 4 5
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);

  // Two rows are consumed and produced per iteration.
  const ptrdiff_t src_step = src_pitch << 1;
  const ptrdiff_t dst_step = dst_pitch << 1;

  const uint16_t *src = src_ptr;
  uint16_t *dst = dst_ptr;

  // Prime the pipeline with rows 2, 3 and 4, interleaved pairwise so that
  // one madd applies two taps to two vertically adjacent pixels.
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(src + src_pitch * 2));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(src + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src + src_pitch * 4));
  __m128i pair23 = _mm_unpacklo_epi16(row2, row3);
  __m128i pair34 = _mm_unpacklo_epi16(row3, row4);

  uint32_t rows_left = height;
  while (rows_left > 1) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src + src_pitch * 6));
    const __m128i pair45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i pair56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit sums for the first and second output row of this pair.
    __m128i acc0 = _mm_add_epi32(_mm_madd_epi16(pair23, taps23),
                                 _mm_madd_epi16(pair45, taps45));
    __m128i acc1 = _mm_add_epi32(_mm_madd_epi16(pair34, taps23),
                                 _mm_madd_epi16(pair56, taps45));

    // Round and drop the 7 fractional filter bits.
    acc0 = _mm_srai_epi32(_mm_add_epi32(acc0, rounding), 7);
    acc1 = _mm_srai_epi32(_mm_add_epi32(acc1, rounding), 7);

    // Narrow to 16 bits (upper half zero) and clamp to the pixel range.
    __m128i out0 = _mm_packs_epi32(acc0, zero);
    __m128i out1 = _mm_packs_epi32(acc1, zero);
    out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
    out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst, out0);
    _mm_storel_epi64((__m128i *)(dst + dst_pitch), out1);

    src += src_step;
    dst += dst_step;

    // Rows 4/5/6 become rows 2/3/4 of the next iteration.
    pair23 = pair45;
    pair34 = pair56;
    row4 = row6;

    rows_left -= 2;
  }
}
103
104
// Horizontal 4-tap convolution of a 4-pixel-wide high-bitdepth block (SSE2).
//
// All eight coefficients are loaded from `filter`, but only filter[2..5]
// take part in the sum. For each of `height` rows, with p[] denoting the
// source row at src_ptr, output pixel c (c = 0..3) is
//   (f2*p[c-1] + f3*p[c] + f4*p[c+1] + f5*p[c+2] + 64) >> 7
// saturated to int16 and then clamped to [0, (1 << bd) - 1].
// Each row performs one 16-byte load starting at p[-1] and one 8-byte store.
// NOTE(review): the clamp uses signed 16-bit compares, so this presumably
// expects bd <= 15 -- confirm against callers.
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // taps 0 1 0 1 2 3 2 3
  const __m128i taps_lo = _mm_unpacklo_epi32(taps, taps);
  // taps 4 5 4 5 6 7 6 7
  const __m128i taps_hi = _mm_unpackhi_epi32(taps, taps);
  // taps 2 3 2 3 2 3 2 3
  const __m128i taps23 = _mm_unpackhi_epi64(taps_lo, taps_lo);
  // taps 4 5 4 5 4 5 4 5
  const __m128i taps45 = _mm_unpacklo_epi64(taps_hi, taps_hi);

  // The first sample read for each row sits one pixel left of the output.
  const uint16_t *row = src_ptr - 1;
  uint16_t *out = dst_ptr;

  for (uint32_t y = 0; y < height; ++y) {
    const __m128i px = _mm_loadu_si128((const __m128i *)row);

    // Build (p[c-1], p[c]) and (p[c+1], p[c+2]) pairs for c = 0..3 by
    // interleaving the row with copies shifted by one pixel.
    const __m128i shift1 = _mm_srli_si128(px, 2);
    const __m128i shift2 = _mm_srli_si128(px, 4);
    const __m128i shift3 = _mm_srli_si128(px, 6);
    const __m128i pairs23 = _mm_unpacklo_epi32(px, shift1);
    const __m128i pairs45 = _mm_unpacklo_epi32(shift2, shift3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs23, taps23),
                                _mm_madd_epi16(pairs45, taps45));

    // Round and drop the 7 fractional filter bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, rounding), 7);

    // Narrow to 16 bits and clamp to the pixel range.
    __m128i res = _mm_packs_epi32(acc, zero);
    res = _mm_min_epi16(_mm_max_epi16(res, zero), pixel_max);

    _mm_storel_epi64((__m128i *)out, res);

    row += src_pitch;
    out += dst_pitch;
  }
}
155
156
// Vertical 4-tap convolution of an 8-pixel-wide high-bitdepth column (SSE2).
//
// All eight coefficients are loaded from `filter`, but only filter[2..5]
// take part in the sum. With S[k] denoting the source row at
// src_ptr + k * src_pitch, output row r is
//   (f2*S[r+2] + f3*S[r+3] + f4*S[r+4] + f5*S[r+5] + 64) >> 7
// saturated to int16 and then clamped to [0, (1 << bd) - 1].
// Rows are produced in pairs, so an odd `height` leaves its last row
// unwritten. Each output row is written as 8 pixels (16 bytes).
//
// Both loads and stores are unaligned, so neither src_ptr nor dst_ptr needs
// any alignment beyond that of uint16_t.
// NOTE(review): the clamp uses signed 16-bit compares, so this presumably
// expects bd <= 15 -- confirm against callers.
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

  // multiply the size of the source and destination stride by two
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  // Prime the pipeline with rows 2, 3 and 4. Adjacent rows are interleaved
  // so that one madd applies two taps to two vertically adjacent pixels;
  // _lo holds columns 0..3 and _hi holds columns 4..7.
  const __m128i srcReg2 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i srcReg3 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i srcReg4 =
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  __m128i srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
  __m128i srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
  __m128i srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
  __m128i srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);

  for (uint32_t i = height; i > 1; i -= 2) {
    const __m128i srcReg5 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i srcReg6 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    const __m128i srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
    const __m128i srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
    const __m128i srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
    const __m128i srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    // (columns 0..3 of both output rows)
    __m128i resReg23_45_lo =
        _mm_add_epi32(_mm_madd_epi16(srcReg23_lo, secondFilters),
                      _mm_madd_epi16(srcReg45_lo, thirdFilters));
    __m128i resReg34_56_lo =
        _mm_add_epi32(_mm_madd_epi16(srcReg34_lo, secondFilters),
                      _mm_madd_epi16(srcReg56_lo, thirdFilters));

    // same for columns 4..7
    __m128i resReg23_45_hi =
        _mm_add_epi32(_mm_madd_epi16(srcReg23_hi, secondFilters),
                      _mm_madd_epi16(srcReg45_hi, thirdFilters));
    __m128i resReg34_56_hi =
        _mm_add_epi32(_mm_madd_epi16(srcReg34_hi, secondFilters),
                      _mm_madd_epi16(srcReg56_hi, thirdFilters));

    // round, then shift by 7 bit each 32 bit
    resReg23_45_lo =
        _mm_srai_epi32(_mm_add_epi32(resReg23_45_lo, addFilterReg64), 7);
    resReg34_56_lo =
        _mm_srai_epi32(_mm_add_epi32(resReg34_56_lo, addFilterReg64), 7);
    resReg23_45_hi =
        _mm_srai_epi32(_mm_add_epi32(resReg23_45_hi, addFilterReg64), 7);
    resReg34_56_hi =
        _mm_srai_epi32(_mm_add_epi32(resReg34_56_hi, addFilterReg64), 7);

    // shrink to 16 bit each 32 bits; resReg23_45 is the first output row
    // of the pair and resReg34_56 the second
    __m128i resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
    __m128i resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

    // clamp to the valid pixel range
    resReg23_45 = _mm_min_epi16(_mm_max_epi16(resReg23_45, zero), max);
    resReg34_56 = _mm_min_epi16(_mm_max_epi16(resReg34_56, zero), max);

    src_ptr += src_stride;

    // Unaligned stores: dst_ptr is a plain uint16_t pointer and nothing in
    // the signature guarantees 16-byte alignment of it or of dst_pitch.
    _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

    dst_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg23_lo = srcReg45_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_lo = srcReg56_lo;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
264
265
// Horizontal 4-tap convolution of an 8-pixel-wide high-bitdepth block (SSE2).
//
// All eight coefficients are loaded from `filter`, but only filter[2..5]
// take part in the sum. For each of `height` rows, with p[] denoting the
// source row at src_ptr, output pixel c (c = 0..7) is
//   (f2*p[c-1] + f3*p[c] + f4*p[c+1] + f5*p[c+2] + 64) >> 7
// saturated to int16 and then clamped to [0, (1 << bd) - 1].
// Each row performs two 16-byte loads covering p[-1..10] and one 16-byte
// store.
//
// Both loads and stores are unaligned, so neither src_ptr nor dst_ptr needs
// any alignment beyond that of uint16_t.
// NOTE(review): the clamp uses signed 16-bit compares, so this presumably
// expects bd <= 15 -- confirm against callers.
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

  // The first sample read for each row sits one pixel left of the output.
  src_ptr -= 1;

  for (uint32_t i = height; i > 0; i -= 1) {
    // p[-1..6] and p[3..10]
    const __m128i srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i srcReg32b2 =
        _mm_loadu_si128((const __m128i *)(src_ptr + 4));

    // Even output pixels (0, 2, 4, 6): taps 2/3 act on p[-1..6] directly,
    // taps 4/5 on the same data advanced by two pixels (p[1..8]).
    const __m128i ss_4 = _mm_unpacklo_epi64(_mm_srli_si128(srcReg32b1, 4),
                                            _mm_srli_si128(srcReg32b2, 4));
    const __m128i res_even =
        _mm_add_epi32(_mm_madd_epi16(srcReg32b1, secondFilters),
                      _mm_madd_epi16(ss_4, thirdFilters));

    // Odd output pixels (1, 3, 5, 7): same scheme, advanced by one pixel
    // (p[0..7] for taps 2/3 and p[2..9] for taps 4/5).
    const __m128i ss_3 = _mm_unpacklo_epi64(_mm_srli_si128(srcReg32b1, 2),
                                            _mm_srli_si128(srcReg32b2, 2));
    const __m128i ss_5 = _mm_unpacklo_epi64(_mm_srli_si128(srcReg32b1, 6),
                                            _mm_srli_si128(srcReg32b2, 6));
    const __m128i res_odd =
        _mm_add_epi32(_mm_madd_epi16(ss_3, secondFilters),
                      _mm_madd_epi16(ss_5, thirdFilters));

    // Re-interleave even/odd sums into pixel order 0..3 and 4..7.
    __m128i res_lo_1 = _mm_unpacklo_epi32(res_even, res_odd);
    __m128i res_hi_1 = _mm_unpackhi_epi32(res_even, res_odd);

    // round, then shift by 7 bit each 32 bit
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, addFilterReg64), 7);
    res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, addFilterReg64), 7);

    // shrink to 16 bit and clamp to the valid pixel range
    __m128i res = _mm_packs_epi32(res_lo_1, res_hi_1);
    res = _mm_min_epi16(_mm_max_epi16(res, zero), max);

    src_ptr += src_pitch;

    // Unaligned store: dst_ptr is a plain uint16_t pointer and nothing in
    // the signature guarantees 16-byte alignment of it or of dst_pitch.
    _mm_storeu_si128((__m128i *)dst_ptr, res);

    dst_ptr += dst_pitch;
  }
}
332
333
// Vertical 4-tap convolution of a 16-pixel-wide high-bitdepth column.
// Implemented as two calls to the 8-wide kernel: columns 0..7 first, then
// columns 8..15, each with the same pitches, height, filter and bit depth.
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
342
343
// Horizontal 4-tap convolution of a 16-pixel-wide high-bitdepth block.
// Implemented as two calls to the 8-wide kernel: columns 0..7 first, then
// columns 8..15, each with the same pitches, height, filter and bit depth.
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}