/src/aom/aom_dsp/x86/highbd_convolve_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | #include <emmintrin.h> |
12 | | |
13 | | #include "config/aom_dsp_rtcd.h" |
14 | | #include "aom_dsp/x86/convolve.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | |
// Vertical 4-tap filter over a 4-sample-wide column of 16-bit pixels.
//
// Only taps 2..5 of the 8-entry |filter| array are applied. With row[k]
// denoting the 4 samples at src_ptr + k * src_pitch, output row r is
//   clamp((f2 * row[r + 2] + f3 * row[r + 3] + f4 * row[r + 4] +
//          f5 * row[r + 5] + 64) >> 7, 0, (1 << bd) - 1)
// Both pitches are counted in uint16_t elements. Rows are produced two at a
// time, so when |height| is odd the final row is left unwritten.
static void aom_highbd_filter_block1d4_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_bias = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the (f2, f3) and (f4, f5) coefficient pairs to every 32-bit
  // lane so that a single madd yields a two-tap partial sum per pixel.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_23 = _mm_shuffle_epi32(taps, 0x55);
  const __m128i taps_45 = _mm_shuffle_epi32(taps, 0xAA);

  // Prime the sliding window with rows 2, 3 and 4.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
  __m128i rows_23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows_34 = _mm_unpacklo_epi16(row3, row4);

  for (uint32_t rows_left = height; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
    const __m128i rows_45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows_56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit filter sums for the two output rows of this iteration.
    __m128i sum_a = _mm_add_epi32(_mm_madd_epi16(rows_23, taps_23),
                                  _mm_madd_epi16(rows_45, taps_45));
    __m128i sum_b = _mm_add_epi32(_mm_madd_epi16(rows_34, taps_23),
                                  _mm_madd_epi16(rows_56, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    sum_a = _mm_srai_epi32(_mm_add_epi32(sum_a, round_bias), 7);
    sum_b = _mm_srai_epi32(_mm_add_epi32(sum_b, round_bias), 7);

    // Narrow to 16 bits (results land in the low 64 bits) and clamp to the
    // valid pixel range for this bit depth.
    __m128i out_a = _mm_packs_epi32(sum_a, zero);
    __m128i out_b = _mm_packs_epi32(sum_b, zero);
    out_a = _mm_min_epi16(_mm_max_epi16(out_a, zero), pixel_max);
    out_b = _mm_min_epi16(_mm_max_epi16(out_b, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out_a);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out_b);

    src_ptr += 2 * src_pitch;
    dst_ptr += 2 * dst_pitch;

    // Slide the window down by two rows.
    rows_23 = rows_45;
    rows_34 = rows_56;
    row4 = row6;
  }
}
102 | | |
// Horizontal 4-tap filter producing 4 output samples per row of 16-bit
// pixels.
//
// Only taps 2..5 of the 8-entry |filter| array are applied. For each of the
// |height| rows, output sample x is
//   clamp((f2 * s[x - 1] + f3 * s[x] + f4 * s[x + 1] + f5 * s[x + 2] + 64)
//         >> 7, 0, (1 << bd) - 1)
// where s points at the start of the row in the source. Each row performs one
// unaligned 8-sample load starting at s[-1]. Both pitches are counted in
// uint16_t elements.
static void aom_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_bias = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // Broadcast the (f2, f3) and (f4, f5) coefficient pairs to all lanes.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i taps_23 = _mm_shuffle_epi32(taps, 0x55);
  const __m128i taps_45 = _mm_shuffle_epi32(taps, 0xAA);

  // The leftmost sample touched by tap 2 sits one element before the row.
  const uint16_t *in_row = src_ptr - 1;
  uint16_t *out_row = dst_ptr;

  for (uint32_t y = 0; y < height; ++y) {
    const __m128i px = _mm_loadu_si128((const __m128i *)in_row);

    // Copies of the row shifted left by 1, 2 and 3 samples.
    const __m128i px_1 = _mm_srli_si128(px, 2);
    const __m128i px_2 = _mm_srli_si128(px, 4);
    const __m128i px_3 = _mm_srli_si128(px, 6);

    // Lane x holds samples (x, x + 1) resp. (x + 2, x + 3).
    const __m128i lo_pairs = _mm_unpacklo_epi32(px, px_1);
    const __m128i hi_pairs = _mm_unpacklo_epi32(px_2, px_3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(lo_pairs, taps_23),
                                _mm_madd_epi16(hi_pairs, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, round_bias), 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    __m128i out = _mm_packs_epi32(acc, zero);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    _mm_storel_epi64((__m128i *)out_row, out);

    in_row += src_pitch;
    out_row += dst_pitch;
  }
}
153 | | |
// Vertical 4-tap filter over an 8-sample-wide column of 16-bit pixels.
//
// Only taps 2..5 of the 8-entry |filter| array are applied. With row[k]
// denoting the 8 samples at src_ptr + k * src_pitch, output row r is
//   clamp((f2 * row[r + 2] + f3 * row[r + 3] + f4 * row[r + 4] +
//          f5 * row[r + 5] + 64) >> 7, 0, (1 << bd) - 1)
// Both pitches are counted in uint16_t elements. Rows are produced two at a
// time, so when |height| is odd the final row is left unwritten.
//
// Neither src_ptr nor dst_ptr needs any particular alignment: all loads and
// stores use the unaligned intrinsics.
static void aom_highbd_filter_block1d8_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

  // Two rows are consumed and produced per iteration, so step by twice the
  // pitch.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  // Prime the sliding window with rows 2, 3 and 4, interleaved pairwise so
  // that madd can apply two taps at once.
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);

  for (i = height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

    // Low four pixels: multiply 2 adjacent rows with the filter pair and add
    // the two partial sums.
    resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);

    resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);

    // High four pixels: same computation.
    resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);

    resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);

    // Round to nearest, then shift each 32-bit lane right by 7 bits.
    resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
    resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
    resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
    resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
    resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
    resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
    resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
    resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);

    // Narrow each 32-bit lane to 16 bits; resReg23_45 is the first output
    // row of this iteration and resReg34_56 the second.
    resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

    // Clamp to the valid pixel range for this bit depth.
    resReg23_45 = _mm_max_epi16(resReg23_45, zero);
    resReg23_45 = _mm_min_epi16(resReg23_45, max);
    resReg34_56 = _mm_max_epi16(resReg34_56, zero);
    resReg34_56 = _mm_min_epi16(resReg34_56, max);

    src_ptr += src_stride;

    // dst_ptr is only guaranteed to be uint16_t-aligned, so the aligned store
    // intrinsic must not be used here.
    _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

    dst_ptr += dst_stride;

    // Keep the rows that the next pair of outputs will reuse.
    srcReg23_lo = srcReg45_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_lo = srcReg56_lo;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
261 | | |
// Horizontal 4-tap filter producing 8 output samples per row of 16-bit
// pixels.
//
// Only taps 2..5 of the 8-entry |filter| array are applied. For each of the
// |height| rows, output sample x is
//   clamp((f2 * s[x - 1] + f3 * s[x] + f4 * s[x + 1] + f5 * s[x + 2] + 64)
//         >> 7, 0, (1 << bd) - 1)
// where s points at the start of the row in the source. Each row performs two
// unaligned 8-sample loads covering s[-1] .. s[10]. Both pitches are counted
// in uint16_t elements.
//
// Neither src_ptr nor dst_ptr needs any particular alignment: all loads and
// stores use the unaligned intrinsics.
static void aom_highbd_filter_block1d8_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;

  // Step back to the first tap of the nominal 8-tap window; the loads below
  // re-add 2 because taps 0 and 1 are not used.
  src_ptr -= 3;

  const __m128i zero = _mm_setzero_si128();
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

  for (i = height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));

    // Even outputs (x = 0, 2, 4, 6): the unshifted row supplies the (f2, f3)
    // pairs, the row shifted by two samples supplies the (f4, f5) pairs.
    __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
    __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
    __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);

    __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);

    // Odd outputs (x = 1, 3, 5, 7): same scheme on rows shifted by one and
    // three samples.
    __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
    __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
    __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
    __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
    __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
    __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);

    d1 = _mm_madd_epi16(ss_3, secondFilters);
    d2 = _mm_madd_epi16(ss_5, thirdFilters);
    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);

    // Interleave even and odd sums back into pixel order.
    __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
    __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);

    // Round to nearest, then shift each 32-bit lane right by 7 bits.
    res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
    res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
    res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
    res_hi_1 = _mm_srai_epi32(res_hi_1, 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
    srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, zero);
    srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);

    src_ptr += src_pitch;

    // dst_ptr is only guaranteed to be uint16_t-aligned, so the aligned store
    // intrinsic must not be used here.
    _mm_storeu_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);

    dst_ptr += dst_pitch;
  }
}
328 | | |
// Vertical 4-tap filter over a 16-sample-wide column, processed as two
// adjacent 8-sample-wide columns (left half first, then right half), each
// handled by aom_highbd_filter_block1d8_v4_sse2() with identical parameters.
static void aom_highbd_filter_block1d16_v4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
337 | | |
// Horizontal 4-tap filter producing 16 output samples per row, processed as
// two adjacent 8-sample-wide blocks (left half first, then right half), each
// handled by aom_highbd_filter_block1d8_h4_sse2() with identical parameters.
static void aom_highbd_filter_block1d16_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
346 | | |
347 | | // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm |
348 | | highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; |
349 | | highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; |
350 | | highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; |
351 | | highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; |
352 | | highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; |
353 | | highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; |
354 | | |
355 | | // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm |
356 | | highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; |
357 | | highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; |
358 | | highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; |
359 | | highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; |
360 | | highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; |
361 | | highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; |
362 | | |
363 | | // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, |
364 | | // ptrdiff_t src_stride, |
365 | | // uint8_t *dst, |
366 | | // ptrdiff_t dst_stride, |
367 | | // const int16_t *filter_x, |
368 | | // int x_step_q4, |
369 | | // const int16_t *filter_y, |
370 | | // int y_step_q4, |
371 | | // int w, int h, int bd); |
372 | | // void aom_highbd_convolve8_vert_sse2(const uint8_t *src, |
373 | | // ptrdiff_t src_stride, |
374 | | // uint8_t *dst, |
375 | | // ptrdiff_t dst_stride, |
376 | | // const int16_t *filter_x, |
377 | | // int x_step_q4, |
378 | | // const int16_t *filter_y, |
379 | | // int y_step_q4, |
380 | | // int w, int h, int bd); |
381 | | HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) |
382 | | HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) |