/src/aom/aom_dsp/x86/highbd_convolve_ssse3.c
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
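
// Vertical ("y") single-reference convolution for high-bitdepth pixels:
// each output row is an 8- or 12-tap filter of the rows above and below it,
// rounded by FILTER_BITS and clamped to the [0, (1 << bd) - 1] range.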
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd) {
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  const int bits = FILTER_BITS;
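
  // All of the rounding happens at the end of the filter: add half of
  // (1 << FILTER_BITS) and shift right (round-half-up), then clamp the
  // result to the valid pixel range for the given bit depth.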
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();
  if (filter_params_y->taps == 12) {
    __m128i s[24], coeffs_y[6];
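    // s[] holds the 12-row sliding window as 16-bit row-pair interleaves:
    // s[0..5] are the low halves of rows (0,1), (2,3), ... interleaved,
    // s[6..11] the matching high halves, and s[12..23] the same layout
    // advanced by one source row, so two output rows come out per iteration.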

    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
      __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
      __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
      __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
      __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
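
      // Interleaving adjacent rows puts one pixel from each of two
      // neighbouring taps in every 32-bit lane, the layout the
      // multiply-accumulate inside convolve_12tap() consumes.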
      s[0] = _mm_unpacklo_epi16(s0, s1);
      s[1] = _mm_unpacklo_epi16(s2, s3);
      s[2] = _mm_unpacklo_epi16(s4, s5);
      s[3] = _mm_unpacklo_epi16(s6, s7);
      s[4] = _mm_unpacklo_epi16(s8, s9);

      s[6] = _mm_unpackhi_epi16(s0, s1);
      s[7] = _mm_unpackhi_epi16(s2, s3);
      s[8] = _mm_unpackhi_epi16(s4, s5);
      s[9] = _mm_unpackhi_epi16(s6, s7);
      s[10] = _mm_unpackhi_epi16(s8, s9);

      s[12] = _mm_unpacklo_epi16(s1, s2);
      s[13] = _mm_unpacklo_epi16(s3, s4);
      s[14] = _mm_unpacklo_epi16(s5, s6);
      s[15] = _mm_unpacklo_epi16(s7, s8);
      s[16] = _mm_unpacklo_epi16(s9, s10);

      s[18] = _mm_unpackhi_epi16(s1, s2);
      s[19] = _mm_unpackhi_epi16(s3, s4);
      s[20] = _mm_unpackhi_epi16(s5, s6);
      s[21] = _mm_unpackhi_epi16(s7, s8);
      s[22] = _mm_unpackhi_epi16(s9, s10);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
        __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));

        s[5] = _mm_unpacklo_epi16(s10, s11);
        s[11] = _mm_unpackhi_epi16(s10, s11);

        s[17] = _mm_unpacklo_epi16(s11, s12);
        s[23] = _mm_unpackhi_epi16(s11, s12);

        const __m128i res_a0 = convolve_12tap(s, coeffs_y);
        __m128i res_a_round0 = _mm_sra_epi32(
            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

        const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
        __m128i res_a_round1 = _mm_sra_epi32(
            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
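
        // Store 8, 4, or 2 pixels per row depending on how much of the
        // current 8-wide column block lies inside the block: a full xmm
        // store for w - j > 4, the low 64 bits for w == 4, and the low
        // 32 bits (two pixels) otherwise.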
        if (w - j > 4) {
          const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
          __m128i res_b_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

          const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
          __m128i res_b_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
          res_16bit0 = _mm_max_epi16(res_16bit0, zero);

          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
          res_16bit1 = _mm_max_epi16(res_16bit1, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_16bit1);
        } else if (w == 4) {
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_a_round1);
        } else {
          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
          res_a_round0 = _mm_max_epi16(res_a_round0, zero);

          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
          res_a_round1 = _mm_max_epi16(res_a_round1, zero);

          *((int *)(&dst[i * dst_stride + j])) =
              _mm_cvtsi128_si32(res_a_round0);

          *((int *)(&dst[i * dst_stride + j + dst_stride])) =
              _mm_cvtsi128_si32(res_a_round1);
        }
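
        // Slide the interleave window down two source rows; only the two
        // newest rows need to be re-interleaved on the next iteration.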
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];

        s[12] = s[13];
        s[13] = s[14];
        s[14] = s[15];
        s[15] = s[16];
        s[16] = s[17];

        s[18] = s[19];
        s[19] = s[20];
        s[20] = s[21];
        s[21] = s[22];
        s[22] = s[23];

        s10 = s12;
      }
    }
  } else {
    __m128i s[16], coeffs_y[4];
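    // The 8-tap layout mirrors the 12-tap one with a shorter window:
    // s[0..3] low-half row-pair interleaves, s[4..7] the high halves,
    // s[8..15] the same two groups advanced by one source row.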

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

    for (j = 0; j < w; j += 8) {
      const uint16_t *data = &src_ptr[j];
      /* Vertical filter */
      {
        __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
        __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
        __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
        __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
        __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
        __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));

        s[0] = _mm_unpacklo_epi16(s0, s1);
        s[1] = _mm_unpacklo_epi16(s2, s3);
        s[2] = _mm_unpacklo_epi16(s4, s5);

        s[4] = _mm_unpackhi_epi16(s0, s1);
        s[5] = _mm_unpackhi_epi16(s2, s3);
        s[6] = _mm_unpackhi_epi16(s4, s5);

        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);

        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);

        for (i = 0; i < h; i += 2) {
          data = &src_ptr[i * src_stride + j];

          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));

          s[3] = _mm_unpacklo_epi16(s6, s7);
          s[7] = _mm_unpackhi_epi16(s6, s7);

          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);

          const __m128i res_a0 = convolve(s, coeffs_y);
          __m128i res_a_round0 = _mm_sra_epi32(
              _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);

          const __m128i res_a1 = convolve(s + 8, coeffs_y);
          __m128i res_a_round1 = _mm_sra_epi32(
              _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
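
          // Same 8-, 4-, or 2-pixel store tail as the 12-tap path above.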
          if (w - j > 4) {
            const __m128i res_b0 = convolve(s + 4, coeffs_y);
            __m128i res_b_round0 = _mm_sra_epi32(
                _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);

            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
            __m128i res_b_round1 = _mm_sra_epi32(
                _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);

            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
            res_16bit0 = _mm_max_epi16(res_16bit0, zero);

            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
            res_16bit1 = _mm_max_epi16(res_16bit1, zero);

            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_16bit1);
          } else if (w == 4) {
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_a_round1);
          } else {
            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
            res_a_round0 = _mm_max_epi16(res_a_round0, zero);

            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
            res_a_round1 = _mm_max_epi16(res_a_round1, zero);

            *((int *)(&dst[i * dst_stride + j])) =
                _mm_cvtsi128_si32(res_a_round0);

            *((int *)(&dst[i * dst_stride + j + dst_stride])) =
                _mm_cvtsi128_si32(res_a_round1);
          }
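
          // Slide the 8-row interleave window down two source rows.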
          s[0] = s[1];
          s[1] = s[2];
          s[2] = s[3];

          s[4] = s[5];
          s[5] = s[6];
          s[6] = s[7];

          s[0 + 8] = s[1 + 8];
          s[1 + 8] = s[2 + 8];
          s[2 + 8] = s[3 + 8];

          s[4 + 8] = s[5 + 8];
          s[5 + 8] = s[6 + 8];
          s[6 + 8] = s[7 + 8];

          s6 = s8;
        }
      }
    }
  }
}
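
// Horizontal ("x") single-reference convolution for high-bitdepth pixels.
// The filter result is rounded in two stages, by conv_params->round_0 and
// then by FILTER_BITS - round_0, before clamping to the valid pixel range.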
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd) {
  int i, j;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  const __m128i round_const_x =
      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
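
  // The two rounding stages remove FILTER_BITS of filter gain in total:
  // conv_params->round_0 bits in the first stage and the remaining
  // FILTER_BITS - round_0 bits in the second.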
  const int bits = FILTER_BITS - conv_params->round_0;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i zero = _mm_setzero_si128();

  if (filter_params_x->taps == 12) {
    __m128i s[6], coeffs_x[6];
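    // For the horizontal pass, s[] holds the same source row at successive
    // 2-pixel offsets, built below with _mm_alignr_epi8.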

    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
          const __m128i row02 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
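
          // Each _mm_alignr_epi8 by 4 bytes advances the window by two
          // 16-bit pixels, so s[0..5] feed the tap pairs for the even
          // output pixels; the odd outputs reuse the same windows shifted
          // by one pixel (2 bytes).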
          // even pixels
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);
          s[4] = _mm_alignr_epi8(row02, row01, 0);
          s[5] = _mm_alignr_epi8(row02, row01, 4);

          __m128i res_even = convolve_12tap(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);

          // odd pixels
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);
          s[4] = _mm_alignr_epi8(row02, row01, 2);
          s[5] = _mm_alignr_epi8(row02, row01, 6);

          __m128i res_odd = convolve_12tap(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);
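
          // Pack the 32-bit even/odd results down to 16 bits and
          // re-interleave them into natural pixel order.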
          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  } else {
    __m128i s[4], coeffs_x[4];
    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

    for (j = 0; j < w; j += 8) {
      /* Horizontal filter */
      {
        for (i = 0; i < h; i += 1) {
          const __m128i row00 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          const __m128i row01 =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
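
          // Same even/odd windowing as the 12-tap path, with four 2-pixel
          // offsets per phase.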
          // even pixels
          s[0] = _mm_alignr_epi8(row01, row00, 0);
          s[1] = _mm_alignr_epi8(row01, row00, 4);
          s[2] = _mm_alignr_epi8(row01, row00, 8);
          s[3] = _mm_alignr_epi8(row01, row00, 12);

          __m128i res_even = convolve(s, coeffs_x);
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
                                   round_shift_x);

          // odd pixels
          s[0] = _mm_alignr_epi8(row01, row00, 2);
          s[1] = _mm_alignr_epi8(row01, row00, 6);
          s[2] = _mm_alignr_epi8(row01, row00, 10);
          s[3] = _mm_alignr_epi8(row01, row00, 14);

          __m128i res_odd = convolve(s, coeffs_x);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
                                  round_shift_x);

          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
                                   round_shift_bits);
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
                                  round_shift_bits);

          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);

          res = _mm_min_epi16(res, clip_pixel);
          res = _mm_max_epi16(res, zero);

          if (w - j > 4) {
            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
          } else if (w == 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
          } else {
            *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
          }
        }
      }
    }
  }
}