/src/libavc/encoder/x86/svc/isvce_downscaler_sse42.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2022 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | |
21 | | /** |
22 | | ****************************************************************************** |
23 | | * @file isvce_downscaler_sse42.c |
24 | | * |
25 | | * @brief |
26 | | * This file contains the x86 SIMD version of the function which does |
27 | | * horizontal scaling and transpose |
28 | | * |
29 | | * @author |
30 | | * Ittiam |
31 | | * |
32 | | * @par List of Functions: |
33 | | * - isvce_horizontal_downscale_and_transpose_sse42() |
34 | | * |
35 | | * @remarks |
36 | | * None |
37 | | * |
38 | | ******************************************************************************* |
39 | | */ |
40 | | |
41 | | /*****************************************************************************/ |
42 | | /* File Includes */ |
43 | | /*****************************************************************************/ |
44 | | |
45 | | /* System include files */ |
46 | | #include <stdio.h> |
47 | | #include <stdlib.h> |
48 | | #include <immintrin.h> |
49 | | |
50 | | /* User include files */ |
51 | | #include "ih264_typedefs.h" |
52 | | #include "isvc_macros.h" |
53 | | #include "ih264_platform_macros.h" |
54 | | #include "isvc_defs.h" |
55 | | #include "isvce_defs.h" |
56 | | #include "isvc_structs.h" |
57 | | #include "isvce_downscaler_private_defs.h" |
58 | | |
59 | | /*****************************************************************************/ |
60 | | /* Function Definitions */ |
61 | | /*****************************************************************************/ |
62 | | |
63 | | /** |
64 | | ******************************************************************************* |
65 | | * |
66 | | * @brief |
67 | | * horizontal scaler function |
68 | | * |
69 | | * @par Description: |
70 | | * Does horizontal scaling for the given block |
71 | | * |
72 | | * @param[in] ps_scaler |
73 | | * pointer to downscaler context |
74 | | * |
75 | | * @param[in] ps_src |
76 | | * pointer to source buffer container |
77 | | * |
78 | | * @param[in] ps_dst |
79 | | * pointer to destination buffer container |
80 | | * |
81 | | * @param[in] pai1_filters |
82 | | * pointer to array of downscaler filters |
83 | | * |
84 | | * @param[in] u4_blk_wd |
85 | | * width of the block after horizontal scaling (output block width) |
86 | | * |
87 | | * @param[in] u4_blk_ht |
88 | | * height of the current block (input block height) |
89 | | * |
90 | | * @param[in] u1_is_chroma |
91 | | * flag indicating whether the buffer is luma or chroma |
92 | | * |
93 | | * |
94 | | * @returns |
95 | | * None |
96 | | * @remarks |
97 | | * The same function is used for vertical scaling too, as the |
98 | | * horizontally scaled output is stored in a transposed fashion. |
99 | | * |
100 | | ******************************************************************************* |
101 | | */ |
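 | | /* Typical usage implied by the remark above (a sketch, not spelled out |
 | | in this file): a first call scales horizontally and writes its output |
 | | transposed; a second call over that intermediate buffer then scales |
 | | the remaining dimension and restores the original orientation. */ |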
102 | | |
103 | | void isvce_horizontal_downscale_and_transpose_sse42( |
104 | | downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst, |
105 | | FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma) |
106 | 933k | { |
107 | 933k | WORD32 i, j; |
108 | 933k | UWORD8 u1_phase; |
109 | 933k | UWORD8 *pu1_src_j, *pu1_dst_j; |
110 | 933k | WORD32 i4_temp_pixel_holder; |
111 | 933k | UWORD32 u4_num_iterations_vertical_by_16; |
112 | 933k | UWORD32 u4_rem_vert_loop; |
113 | 933k | UWORD8 *pu1_in_pixel; |
114 | 933k | UWORD8 *pu1_out_pixel; |
115 | 933k | WORD8 *pi1_filter_for_grid; |
116 | 933k | UWORD16 u2_full_pixel_inc; |
117 | | |
118 | 933k | __m128i src_temp_0, src_temp_1, src_temp_2, src_temp_3, src_temp_4, src_temp_5, src_temp_6, |
119 | 933k | src_temp_7; |
120 | | |
121 | 933k | __m128i reg_all_1s, reg_64val_32bit, reg_all_0s, filt_coeff_grid, reg_shuffle; |
122 | | |
123 | 933k | __m128i reg_01_16x8b, reg_02_16x8b, reg_03_16x8b, reg_04_16x8b, reg_05_16x8b; |
124 | | |
125 | 933k | downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state; |
126 | | |
127 | 933k | UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset; |
128 | 933k | UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment; |
129 | 933k | UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment; |
130 | | |
131 | 933k | UWORD8 *pu1_src = ps_src->pv_data; |
132 | 933k | UWORD32 u4_in_stride = ps_src->i4_data_stride; |
133 | 933k | UWORD8 *pu1_dst = ps_dst->pv_data; |
134 | 933k | UWORD32 u4_out_stride = ps_dst->i4_data_stride; |
135 | 933k | UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos; |
136 | | |
137 | 933k | ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments); |
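 | | /* The vertical increment is pinned to 1.0 in Q(DOWNSCALER_Q): this pass |
 | | never skips input rows, so any vertical decimation is left to the |
 | | second (transposed) pass through this same function. */ |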
138 | | |
139 | 933k | reg_all_1s = _mm_set1_epi16((short) 1); |
140 | 933k | reg_64val_32bit = _mm_set1_epi32((int) 64); |
141 | 933k | reg_all_0s = _mm_setzero_si128(); |
142 | 933k | reg_shuffle = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); |
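 | | /* Shuffle mask gathering even bytes into the low 64 bits and odd bytes |
 | | into the high 64 bits: it deinterleaves Cb/Cr on load and separates |
 | | the even/odd results again before the chroma stores. */ |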
143 | | |
144 | 933k | u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4; |
145 | 933k | u4_rem_vert_loop = u4_blk_ht % 16; |
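 | | /* The main loops below consume 16 input rows per iteration; any |
 | | leftover rows are handled by the remainder loops further down. */ |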
146 | | |
147 | | /* Offset the input so that the input pixel to be processed |
148 | | coincides with the centre of the filter (4th coefficient) */ |
149 | 933k | pu1_src += (1 + u1_is_chroma); |
150 | | |
151 | 933k | if(!u1_is_chroma) |
152 | 700k | { |
153 | 3.21M | for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++) |
154 | 2.51M | { |
155 | 2.51M | pu1_src_j = pu1_src + ((j << 4) * u4_in_stride); |
156 | 2.51M | pu1_dst_j = pu1_dst + (j << 4); |
157 | | |
158 | 2.51M | u4_center_pixel_pos = u4_center_pixel_pos_src; |
159 | | |
160 | 102M | for(i = 0; i < (WORD32) u4_blk_wd; i++) |
161 | 99.5M | { |
162 | 99.5M | u1_phase = get_filter_phase(u4_center_pixel_pos); |
163 | 99.5M | pi1_filter_for_grid = pai1_filters[u1_phase]; |
164 | | |
165 | 99.5M | u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q; |
166 | | |
167 | 99.5M | pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma); |
168 | | |
169 | 99.5M | pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride); |
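 | | /* u4_center_pixel_pos is a Q(DOWNSCALER_Q) fixed-point source position: |
 | | its integer part picks the starting input pixel (doubled for the |
 | | interleaved chroma case) and its fractional part, the phase, picks |
 | | the filter from pai1_filters. */ |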
170 | | |
171 | 99.5M | filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid); |
172 | | /******************************************************/ |
173 | | /* This loop is going vertically in bottom direction */ |
174 | | /* but the output pixels are stored in horizontal */ |
175 | | /* direction in transpose manner */ |
176 | | /******************************************************/ |
177 | | |
178 | | /*For row 0,1*/ |
179 | 99.5M | src_temp_0 = _mm_loadl_epi64((__m128i *) pu1_in_pixel); |
180 | 99.5M | src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride)); |
181 | | /*transfer the 8 pixels of temp_1 into bits 64-127 of temp_0*/ |
182 | 99.5M | src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1); |
183 | | |
184 | | /*For row 2,3*/ |
185 | 99.5M | src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 2)); |
186 | | |
187 | 99.5M | src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 3)); |
188 | | |
189 | 99.5M | src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3); |
190 | | |
191 | 99.5M | reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
192 | | |
193 | | /*multiply with filter coeffs to get 16 bit results*/ |
194 | 99.5M | reg_02_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid); |
195 | | |
196 | 99.5M | reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b); |
197 | | /*add adjacent 16 bit values to get 32 bit values*/ |
198 | 99.5M | reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s); |
199 | | |
200 | | /*Add offset of 64 for rounding each out pixel value*/ |
201 | 99.5M | reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit); |
202 | | /*Divide each out pixel value by 128*/ |
203 | 99.5M | reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, 7); |
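 | | /* Net per-pixel arithmetic so far: out = (filter_sum + 64) >> 7, a |
 | | rounded division by 128. */ |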
204 | | |
205 | | /*For row 4,5*/ |
206 | 99.5M | src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 4)); |
207 | | |
208 | 99.5M | src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 5)); |
209 | | |
210 | 99.5M | src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5); |
211 | | |
212 | | /*For row 6,7*/ |
213 | 99.5M | src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 6)); |
214 | | |
215 | 99.5M | src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 7)); |
216 | | |
217 | 99.5M | src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7); |
218 | | |
219 | 99.5M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid); |
220 | | |
221 | 99.5M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid); |
222 | | |
223 | 99.5M | reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b); |
224 | | |
225 | 99.5M | reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s); |
226 | | |
227 | | /*next add 2 adjacent 32 bit values to get a single |
228 | | 32 bit value in each row*/ |
229 | |
230 | | |
231 | | /*Add offset of 64 for rounding each out pixel value*/ |
232 | 99.5M | reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit); |
233 | | /*Divide each out pixel value by 128*/ |
234 | 99.5M | reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, 7); |
235 | | |
236 | | /*pack the lower 16 bit values corresponding to the 8 output |
237 | | pixels from reg_01 and reg_03*/ |
238 | 99.5M | reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b); |
239 | | |
240 | | /*For row 8,9*/ |
241 | 99.5M | src_temp_0 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 8 * u4_in_stride)); |
242 | | |
243 | 99.5M | src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 9 * u4_in_stride)); |
244 | | |
245 | | /*transfer the 8 pixels of temp_1 into bits 64-127 of temp_0*/ |
246 | 99.5M | src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1); |
247 | | |
248 | | /*For row 10,11*/ |
249 | 99.5M | src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 10)); |
250 | | |
251 | 99.5M | src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 11)); |
252 | | |
253 | 99.5M | src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3); |
254 | | |
255 | 99.5M | reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
256 | | |
257 | | /*multiply with filter coeffs to get 16 bit results*/ |
258 | 99.5M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid); |
259 | | |
260 | 99.5M | reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b); |
261 | | /*add adjacent 16 bit values to get 32 bit values*/ |
262 | 99.5M | reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s); |
263 | | |
264 | | /*next add 2 adjacent 32 bit values to get a single |
265 | | 32 bit value in each row*/ |
266 | | |
267 | | /*Add offset of 64 for rounding each out pixel value*/ |
268 | 99.5M | reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit); |
269 | | /*Divide each out pixel value by 128*/ |
270 | 99.5M | reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, 7); |
271 | | |
272 | | /*For row 12,13*/ |
273 | 99.5M | src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 12)); |
274 | | |
275 | 99.5M | src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 13)); |
276 | | |
277 | 99.5M | src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5); |
278 | | |
279 | | /*For row 14,15*/ |
280 | 99.5M | src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 14)); |
281 | | |
282 | 99.5M | src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 15)); |
283 | | |
284 | 99.5M | src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7); |
285 | | |
286 | 99.5M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid); |
287 | | |
288 | 99.5M | reg_05_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid); |
289 | | |
290 | 99.5M | reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b); |
291 | | /*add adjacent 16 bit values to get 32 bit values*/ |
292 | 99.5M | reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s); |
293 | | |
294 | | /*next add 2 adjacent 32 bit values to get a single |
295 | | 32 bit value in each row*/ |
296 | | |
297 | | /*Add offset of 64 for rounding each out pixel value*/ |
298 | 99.5M | reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit); |
299 | | /*Divide each out pixel value by 128*/ |
300 | 99.5M | reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, 7); |
301 | | |
302 | | /*pack the lower 16 bit values corresponding to the 8 output |
303 | | pixels from reg_02 and reg_04*/ |
304 | 99.5M | reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b); |
305 | | |
306 | | /*next get saturated 8 bit output pixel values for rows 0-15*/ |
307 | 99.5M | reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b); |
308 | | |
309 | | /*Store the 16 output values*/ |
310 | 99.5M | _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_01_16x8b); |
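 | | /* The 16 results for input rows 0..15 land as 16 consecutive output |
 | | bytes: this store is what performs the transpose. */ |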
311 | | |
312 | 99.5M | pu1_out_pixel += 16; |
313 | | |
314 | 99.5M | pu1_in_pixel += ((u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q); |
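 | | /* With the vertical increment fixed at 1.0, this advances the input by |
 | | exactly the 16 rows just consumed. */ |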
315 | | |
316 | | /* Update the context for next Loop Count */ |
317 | 99.5M | u4_center_pixel_pos += u4_src_horz_increments; |
318 | 99.5M | } |
319 | 2.51M | } |
320 | | |
321 | | /*if height is not a multiple of 16, process 1 row at a |
322 | | time for the remaining rows*/ |
323 | 700k | if(u4_rem_vert_loop) |
324 | 233k | { |
325 | 233k | pu1_src_j = pu1_src + ((j << 4) * u4_in_stride); |
326 | 233k | pu1_dst_j = pu1_dst + (j << 4); |
327 | | |
328 | 233k | u4_center_pixel_pos = u4_center_pixel_pos_src; |
329 | | |
330 | 11.0M | for(i = 0; i < (WORD32) u4_blk_wd; i++) |
331 | 10.7M | { |
332 | 10.7M | u1_phase = get_filter_phase(u4_center_pixel_pos); |
333 | 10.7M | pi1_filter_for_grid = pai1_filters[u1_phase]; |
334 | | |
335 | 10.7M | u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q; |
336 | | |
337 | 10.7M | pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma); |
338 | | |
339 | 10.7M | pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride); |
340 | | |
341 | 10.7M | filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid); |
342 | | |
343 | 97.0M | for(j = u4_rem_vert_loop; j > 0; j--) |
344 | 86.2M | { |
345 | 86.2M | src_temp_0 = _mm_loadl_epi64((__m128i const *) pu1_in_pixel); |
346 | | |
347 | 86.2M | src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
348 | | |
349 | 86.2M | src_temp_0 = _mm_madd_epi16(src_temp_0, reg_all_1s); |
350 | | |
351 | 86.2M | reg_01_16x8b = _mm_hadd_epi32(src_temp_0, reg_all_0s); |
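 | | /* lane 0 of reg_01_16x8b now holds the complete filter sum for this |
 | | single output pixel */ |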
352 | | |
353 | | /*Add offset of 64 for rounding each out pixel value*/ |
354 | 86.2M | reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit); |
355 | | /*Divide each out pixel value by 128*/ |
356 | 86.2M | reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7); |
357 | | |
358 | 86.2M | reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s); |
359 | | |
360 | | /*next get saturated 8 bit output pixel values*/ |
361 | 86.2M | reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s); |
362 | | |
363 | | /*Store the single output value*/ |
364 | 86.2M | *pu1_out_pixel = (UWORD8) _mm_cvtsi128_si32(reg_01_16x8b); |
365 | | |
366 | 86.2M | pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q; |
367 | | |
368 | 86.2M | pu1_out_pixel++; |
369 | 86.2M | } |
370 | | /* Update the context for next Loop Count */ |
371 | 10.7M | u4_center_pixel_pos += u4_src_horz_increments; |
372 | 10.7M | } |
373 | 233k | } |
374 | 700k | } |
375 | | |
376 | 233k | else /* for chroma */ |
377 | 233k | { |
378 | 831k | for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++) |
379 | 598k | { |
380 | 598k | pu1_src_j = pu1_src + ((j << 4) * u4_in_stride); |
381 | 598k | pu1_dst_j = pu1_dst + (j << 4); |
382 | | |
383 | 598k | u4_center_pixel_pos = u4_center_pixel_pos_src; |
384 | | |
385 | 28.9M | for(i = 0; i < (WORD32) u4_blk_wd; i++) |
386 | 28.3M | { |
387 | 28.3M | u1_phase = get_filter_phase(u4_center_pixel_pos); |
388 | 28.3M | pi1_filter_for_grid = pai1_filters[u1_phase]; |
389 | | |
390 | 28.3M | u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q; |
391 | | |
392 | 28.3M | pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma); |
393 | | |
394 | 28.3M | pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride); |
395 | | |
396 | 28.3M | filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid); |
397 | | /******************************************************/ |
398 | | /* This loop is going vertically in bottom direction */ |
399 | | /* but the output pixels are stored in horizontal */ |
400 | | /* direction in transpose manner */ |
401 | | /******************************************************/ |
402 | | |
403 | | /*Load 16 values, shuffle to separate Cb and Cr, and process*/ |
404 | | |
405 | 28.3M | src_temp_0 = _mm_loadu_si128((__m128i *) pu1_in_pixel); |
406 | 28.3M | src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride)); |
407 | | |
408 | 28.3M | src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 2)); |
409 | | |
410 | 28.3M | src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 3)); |
411 | | |
412 | 28.3M | src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle); |
413 | 28.3M | src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle); |
414 | 28.3M | src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle); |
415 | 28.3M | src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle); |
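 | | /* Each shuffled register now holds 8 Cb samples in its low half and |
 | | 8 Cr samples in its high half, so a single maddubs filters both |
 | | chroma planes at once (the coefficient grid repeats the taps in both |
 | | 64-bit halves, as the luma path above also relies on). */ |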
416 | | |
417 | 28.3M | reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
418 | 28.3M | reg_02_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid); |
419 | | |
420 | 28.3M | reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b); |
421 | | |
422 | 28.3M | reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s); |
423 | | |
424 | 28.3M | reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit); |
425 | | |
426 | 28.3M | reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7); |
427 | | |
428 | 28.3M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid); |
429 | 28.3M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid); |
430 | | |
431 | 28.3M | src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 4)); |
432 | | |
433 | 28.3M | src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 5)); |
434 | | |
435 | 28.3M | src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 6)); |
436 | | |
437 | 28.3M | src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 7)); |
438 | | |
439 | 28.3M | src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle); |
440 | 28.3M | src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle); |
441 | 28.3M | src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle); |
442 | 28.3M | src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle); |
443 | | |
444 | 28.3M | reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b); |
445 | | |
446 | 28.3M | reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s); |
447 | | |
448 | 28.3M | reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit); |
449 | | |
450 | 28.3M | reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7); |
451 | | |
452 | 28.3M | reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b); |
453 | | |
454 | 28.3M | reg_02_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid); |
455 | 28.3M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid); |
456 | | |
457 | 28.3M | reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_04_16x8b); |
458 | | |
459 | 28.3M | reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s); |
460 | | |
461 | 28.3M | reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit); |
462 | | |
463 | 28.3M | reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7); |
464 | | |
465 | 28.3M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid); |
466 | 28.3M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid); |
467 | | |
468 | 28.3M | reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b); |
469 | | |
470 | 28.3M | reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s); |
471 | | |
472 | 28.3M | reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit); |
473 | | |
474 | 28.3M | reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7); |
475 | | |
476 | 28.3M | reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_03_16x8b); |
477 | | |
478 | 28.3M | reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b); |
479 | | |
480 | 28.3M | reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle); |
481 | | |
482 | 28.3M | src_temp_0 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 8 * u4_in_stride)); |
483 | | |
484 | 28.3M | src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 9 * u4_in_stride)); |
485 | | |
486 | 28.3M | src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 10)); |
487 | | |
488 | 28.3M | src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 11)); |
489 | | |
490 | 28.3M | src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle); |
491 | 28.3M | src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle); |
492 | 28.3M | src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle); |
493 | 28.3M | src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle); |
494 | | |
495 | 28.3M | reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
496 | 28.3M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid); |
497 | | |
498 | 28.3M | reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b); |
499 | | |
500 | 28.3M | reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s); |
501 | | |
502 | 28.3M | reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit); |
503 | | |
504 | 28.3M | reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7); |
505 | | |
506 | 28.3M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid); |
507 | 28.3M | reg_05_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid); |
508 | | |
509 | 28.3M | src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 12)); |
510 | | |
511 | 28.3M | src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 13)); |
512 | | |
513 | 28.3M | src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 14)); |
514 | | |
515 | 28.3M | src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 15)); |
516 | | |
517 | 28.3M | src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle); |
518 | 28.3M | src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle); |
519 | 28.3M | src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle); |
520 | 28.3M | src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle); |
521 | | |
522 | 28.3M | reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b); |
523 | | |
524 | 28.3M | reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s); |
525 | | |
526 | 28.3M | reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit); |
527 | | |
528 | 28.3M | reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7); |
529 | | |
530 | 28.3M | reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b); |
531 | | |
532 | 28.3M | reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid); |
533 | 28.3M | reg_05_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid); |
534 | | |
535 | 28.3M | reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_05_16x8b); |
536 | | |
537 | 28.3M | reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s); |
538 | | |
539 | 28.3M | reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit); |
540 | | |
541 | 28.3M | reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7); |
542 | | |
543 | 28.3M | reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid); |
544 | 28.3M | reg_05_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid); |
545 | | |
546 | 28.3M | reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b); |
547 | | |
548 | 28.3M | reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s); |
549 | | |
550 | 28.3M | reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit); |
551 | | |
552 | 28.3M | reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7); |
553 | | |
554 | 28.3M | reg_03_16x8b = _mm_packus_epi32(reg_03_16x8b, reg_04_16x8b); |
555 | | |
556 | 28.3M | reg_02_16x8b = _mm_packus_epi16(reg_02_16x8b, reg_03_16x8b); |
557 | | |
558 | 28.3M | reg_02_16x8b = _mm_shuffle_epi8(reg_02_16x8b, reg_shuffle); |
559 | | |
560 | 28.3M | reg_03_16x8b = _mm_unpacklo_epi64(reg_01_16x8b, reg_02_16x8b); |
561 | | |
562 | 28.3M | reg_04_16x8b = _mm_unpackhi_epi64(reg_01_16x8b, reg_02_16x8b); |
563 | | |
564 | | /*Store the deinterleaved results: reg_03 holds the 16 Cb outputs, reg_04 the 16 Cr outputs*/ |
565 | | |
566 | 28.3M | _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_03_16x8b); |
567 | 28.3M | _mm_storeu_si128((__m128i *) (pu1_out_pixel + u4_out_stride), reg_04_16x8b); |
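 | | /* Cb outputs go to output row 2*i and Cr outputs to row 2*i + 1, per |
 | | the (i << u1_is_chroma) * u4_out_stride addressing above. */ |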
568 | | |
569 | 28.3M | pu1_out_pixel += 16; |
570 | | |
571 | 28.3M | pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q; |
572 | | |
573 | | /* Update the context for next Loop Count */ |
574 | 28.3M | u4_center_pixel_pos += u4_src_horz_increments; |
575 | 28.3M | } |
576 | 598k | } |
577 | | |
578 | | /*if height is not a multiple of 16, process 2 rows at a |
579 | | time for the remaining rows*/ |
580 | 233k | if(u4_rem_vert_loop) |
581 | 199k | { |
582 | 199k | pu1_src_j = pu1_src + ((j << 4) * u4_in_stride); |
583 | 199k | pu1_dst_j = pu1_dst + (j << 4); |
584 | | |
585 | 199k | u4_center_pixel_pos = u4_center_pixel_pos_src; |
586 | 9.09M | for(i = 0; i < (WORD32) u4_blk_wd; i++) |
587 | 8.89M | { |
588 | 8.89M | UWORD8 u1_phase = get_filter_phase(u4_center_pixel_pos); |
589 | 8.89M | pi1_filter_for_grid = pai1_filters[u1_phase]; |
590 | | |
591 | 8.89M | u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q; |
592 | | |
593 | 8.89M | pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma); |
594 | | |
595 | 8.89M | pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride); |
596 | | |
597 | 8.89M | filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid); |
598 | | |
599 | 44.4M | for(j = u4_rem_vert_loop; j > 0; j = j - 2) |
600 | 35.5M | { |
601 | 35.5M | src_temp_0 = _mm_loadu_si128((__m128i const *) pu1_in_pixel); |
602 | 35.5M | src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle); |
603 | | |
604 | 35.5M | src_temp_1 = _mm_loadu_si128((__m128i const *) (pu1_in_pixel + u4_in_stride)); |
605 | | |
606 | 35.5M | src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle); |
607 | | |
608 | 35.5M | src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid); |
609 | 35.5M | src_temp_1 = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid); |
610 | | |
611 | 35.5M | reg_01_16x8b = _mm_hadd_epi16(src_temp_0, src_temp_1); |
612 | | |
613 | 35.5M | reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s); |
614 | | |
615 | | /*Add offset of 64 for rounding each out pixel value*/ |
616 | 35.5M | reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit); |
617 | | /*Divide each out pixel value by 128*/ |
618 | 35.5M | reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7); |
619 | | |
620 | 35.5M | reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s); |
621 | | |
622 | | /*next get saturated 8 bit output pixel values*/ |
623 | 35.5M | reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s); |
624 | | |
625 | 35.5M | reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle); |
626 | | |
627 | 35.5M | reg_02_16x8b = _mm_srli_si128(reg_01_16x8b, (int) 8); |
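 | | /* the low half of reg_01_16x8b now holds the 2 Cb results and |
 | | reg_02_16x8b the 2 Cr results */ |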
628 | | |
629 | | /*Store the 4 output bytes: 2 Cb values to this row, 2 Cr values to the next*/ |
630 | 35.5M | i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_01_16x8b); |
631 | | |
632 | 35.5M | *pu1_out_pixel = (UWORD8) i4_temp_pixel_holder; |
633 | 35.5M | i4_temp_pixel_holder >>= 8; |
634 | | |
635 | 35.5M | *(pu1_out_pixel + 1) = (UWORD8) i4_temp_pixel_holder; |
636 | | |
637 | 35.5M | i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_02_16x8b); |
638 | | |
639 | 35.5M | *(pu1_out_pixel + u4_out_stride) = (UWORD8) i4_temp_pixel_holder; |
640 | 35.5M | i4_temp_pixel_holder >>= 8; |
641 | | |
642 | 35.5M | *(pu1_out_pixel + u4_out_stride + 1) = (UWORD8) i4_temp_pixel_holder; |
643 | | |
644 | 35.5M | pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 1)) >> DOWNSCALER_Q; |
645 | 35.5M | pu1_out_pixel += 2; |
646 | 35.5M | } |
647 | | /* Update the context for next Loop Count */ |
648 | 8.89M | u4_center_pixel_pos += u4_src_horz_increments; |
649 | 8.89M | } |
650 | 199k | } |
651 | 233k | } |
652 | 933k | } |