/src/aom/aom_dsp/x86/highbd_convolve_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | #include <emmintrin.h> |
12 | | |
13 | | #include "config/aom_dsp_rtcd.h" |
14 | | #include "aom_dsp/x86/convolve.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | |
// Vertical 4-tap filter for a 4-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the computation, although all eight
// coefficients are loaded. Output row r is built from the source rows
// r + 2 .. r + 5 (counted from src_ptr), rounded with +64 and >> 7, and
// clamped to [0, (1 << bd) - 1].
//
// Rows are produced in pairs: when height is odd the final row is not
// written, and when height < 2 nothing is written (source rows 2..4 are still
// read before the loop).
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i round_offset = _mm_set1_epi32(64);

  // Broadcast the coefficient pairs (2,3) and (4,5) across a register each.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // Prime the pipeline with rows 2, 3 and 4; each later iteration only needs
  // to fetch two new rows.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // Interleave vertically adjacent rows so that one madd multiplies a pixel
  // pair by a coefficient pair.
  __m128i rows23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows34 = _mm_unpacklo_epi16(row3, row4);

  uint32_t rows_left = height;
  while (rows_left > 1) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i rows45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit sums of the four products for each of the two output rows.
    __m128i sum_first = _mm_add_epi32(_mm_madd_epi16(rows23, taps_23),
                                      _mm_madd_epi16(rows45, taps_45));
    __m128i sum_second = _mm_add_epi32(_mm_madd_epi16(rows34, taps_23),
                                       _mm_madd_epi16(rows56, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    sum_first = _mm_srai_epi32(_mm_add_epi32(sum_first, round_offset), 7);
    sum_second = _mm_srai_epi32(_mm_add_epi32(sum_second, round_offset), 7);

    // Narrow to 16 bits (signed saturation), then clamp to the pixel range.
    __m128i out_first = _mm_packs_epi32(sum_first, zero);
    __m128i out_second = _mm_packs_epi32(sum_second, zero);
    out_first = _mm_min_epi16(_mm_max_epi16(out_first, zero), pixel_max);
    out_second = _mm_min_epi16(_mm_max_epi16(out_second, zero), pixel_max);

    // Only the low four 16-bit lanes hold results.
    _mm_storel_epi64((__m128i *)dst_ptr, out_first);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out_second);

    // Step two rows down in both buffers.
    src_ptr += src_pitch * 2;
    dst_ptr += dst_pitch * 2;

    // Carry the already interleaved rows over to the next pair of outputs.
    rows23 = rows45;
    rows34 = rows56;
    row4 = row6;

    rows_left -= 2;
  }
}
103 | | |
// Horizontal 4-tap filter for a 4-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the computation, although all eight
// coefficients are loaded. Output pixel c of a row is built from the source
// pixels c - 1 .. c + 2 of that row, rounded with +64 and >> 7, and clamped to
// [0, (1 << bd) - 1]. Each row reads eight source pixels starting one pixel
// to the left of src_ptr and writes four output pixels.
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i round_offset = _mm_set1_epi32(64);

  // Broadcast the coefficient pairs (2,3) and (4,5) across a register each.
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // The first pixel touched by the filter lies one sample left of src_ptr.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t rows_left = height; rows_left > 0; --rows_left) {
    const __m128i px = _mm_loadu_si128((const __m128i *)row);

    // Byte shifts give the same row advanced by one, two and three pixels.
    const __m128i px_plus1 = _mm_srli_si128(px, 2);
    const __m128i px_plus2 = _mm_srli_si128(px, 4);
    const __m128i px_plus3 = _mm_srli_si128(px, 6);

    // Lane k of each register holds the pixel pair that starts at k
    // (pairs_23) or at k + 2 (pairs_45).
    const __m128i pairs_23 = _mm_unpacklo_epi32(px, px_plus1);
    const __m128i pairs_45 = _mm_unpacklo_epi32(px_plus2, px_plus3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs_23, taps_23),
                                _mm_madd_epi16(pairs_45, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, round_offset), 7);

    // Narrow to 16 bits (signed saturation), then clamp to the pixel range.
    __m128i out = _mm_packs_epi32(acc, zero);
    out = _mm_min_epi16(_mm_max_epi16(out, zero), pixel_max);

    // Only the low four 16-bit lanes hold results.
    _mm_storel_epi64((__m128i *)dst_ptr, out);

    row += src_pitch;
    dst_ptr += dst_pitch;
  }
}
155 | | |
// Vertical 4-tap filter for an 8-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the computation, although all eight
// coefficients are loaded. Output row r is built from the source rows
// r + 2 .. r + 5 (counted from src_ptr), rounded with +64 and >> 7, and
// clamped to [0, (1 << bd) - 1].
//
// Rows are produced in pairs: when height is odd the final row is not
// written, and when height < 2 nothing is written (source rows 2..4 are still
// read before the loop).
//
// Neither src_ptr nor dst_ptr needs any particular alignment: both loads and
// stores use the unaligned instructions.
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

  // Two rows are consumed and produced per iteration, so both pointers advance
  // by twice the pitch. A multiplication is used rather than a left shift so
  // that the expression stays well defined for a negative pitch.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  // Prime the pipeline with rows 2, 3 and 4; each later iteration only needs
  // to fetch two new rows. Adjacent rows are interleaved so that one madd
  // multiplies a vertical pixel pair by a coefficient pair.
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);

  for (i = height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    // (columns 0..3)
    resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);

    resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    // (columns 4..7)
    resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);

    resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);

    // round to nearest and shift each 32-bit lane right by 7 bits
    resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
    resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
    resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
    resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
    resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
    resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
    resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
    resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);

    // shrink each 32-bit lane to 16 bits (signed saturation); the low half
    // holds columns 0..3 and the high half holds columns 4..7
    resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

    // clamp to the valid pixel range for this bit depth
    resReg23_45 = _mm_max_epi16(resReg23_45, zero);
    resReg23_45 = _mm_min_epi16(resReg23_45, max);
    resReg34_56 = _mm_max_epi16(resReg34_56, zero);
    resReg34_56 = _mm_min_epi16(resReg34_56, max);

    src_ptr += src_stride;

    // Unaligned stores: nothing here establishes that dst_ptr and dst_pitch
    // keep every row 16-byte aligned, and an aligned store would fault
    // otherwise.
    _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

    dst_ptr += dst_stride;

    // save part of the registers for the next pair of rows
    srcReg23_lo = srcReg45_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_lo = srcReg56_lo;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
264 | | |
// Horizontal 4-tap filter for an 8-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the computation, although all eight
// coefficients are loaded. Output pixel c of a row is built from the source
// pixels c - 1 .. c + 2 of that row, rounded with +64 and >> 7, and clamped to
// [0, (1 << bd) - 1]. Each row reads the source pixels -1 .. 10 relative to
// src_ptr and writes eight output pixels.
//
// Neither src_ptr nor dst_ptr needs any particular alignment: both loads and
// stores use the unaligned instructions.
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

  // The first pixel touched by the filter lies one sample left of src_ptr.
  const uint16_t *row = src_ptr - 1;

  for (uint32_t i = height; i > 0; i -= 1) {
    // Two overlapping loads: pixels -1..6 and 3..10 relative to src_ptr.
    const __m128i srcReg32b1 = _mm_loadu_si128((const __m128i *)row);
    const __m128i srcReg32b2 = _mm_loadu_si128((const __m128i *)(row + 4));

    // Even output pixels (0, 2, 4, 6): the pair for taps 2,3 is already in
    // place in the first load; the pair for taps 4,5 starts two pixels later.
    const __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
    const __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
    const __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);

    __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
    const __m128i res_even = _mm_add_epi32(d1, d2);

    // Odd output pixels (1, 3, 5, 7): same scheme, shifted by one pixel.
    const __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
    const __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
    const __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
    const __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
    const __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
    const __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);

    d1 = _mm_madd_epi16(ss_3, secondFilters);
    d2 = _mm_madd_epi16(ss_5, thirdFilters);
    const __m128i res_odd = _mm_add_epi32(d1, d2);

    // Interleave even and odd sums back into pixel order 0..3 and 4..7.
    __m128i res_lo_1 = _mm_unpacklo_epi32(res_even, res_odd);
    __m128i res_hi_1 = _mm_unpackhi_epi32(res_even, res_odd);

    // round to nearest and shift each 32-bit lane right by 7 bits
    res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
    res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
    res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
    res_hi_1 = _mm_srai_epi32(res_hi_1, 7);

    // Narrow to 16 bits (signed saturation), then clamp to the pixel range.
    __m128i out = _mm_packs_epi32(res_lo_1, res_hi_1);
    out = _mm_max_epi16(out, zero);
    out = _mm_min_epi16(out, max);

    row += src_pitch;

    // Unaligned store: nothing here establishes that dst_ptr and dst_pitch
    // keep every row 16-byte aligned, and an aligned store would fault
    // otherwise.
    _mm_storeu_si128((__m128i *)dst_ptr, out);

    dst_ptr += dst_pitch;
  }
}
332 | | |
// Vertical 4-tap filter for a 16-pixel-wide block of 16-bit samples.
//
// The block is handled as two independent 8-pixel-wide halves (columns 0..7,
// then columns 8..15), each delegated to aom_highbd_filter_block1d8_v4_sse2
// with otherwise unchanged arguments.
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
342 | | |
// Horizontal 4-tap filter for a 16-pixel-wide block of 16-bit samples.
//
// The block is handled as two independent 8-pixel-wide halves (columns 0..7,
// then columns 8..15), each delegated to aom_highbd_filter_block1d8_h4_sse2
// with otherwise unchanged arguments.
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}