Coverage Report

Created: 2024-07-27 06:35

/src/libavc/encoder/x86/svc/isvce_downscaler_sse42.c

Line|  Count|Source
   1|       |/******************************************************************************
   2|       | *
   3|       | * Copyright (C) 2022 The Android Open Source Project
   4|       | *
   5|       | * Licensed under the Apache License, Version 2.0 (the "License");
   6|       | * you may not use this file except in compliance with the License.
   7|       | * You may obtain a copy of the License at:
   8|       | *
   9|       | * http://www.apache.org/licenses/LICENSE-2.0
  10|       | *
  11|       | * Unless required by applicable law or agreed to in writing, software
  12|       | * distributed under the License is distributed on an "AS IS" BASIS,
  13|       | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14|       | * See the License for the specific language governing permissions and
  15|       | * limitations under the License.
  16|       | *
  17|       | *****************************************************************************
  18|       | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
  19|       | */
  20|       |
  21|       |/**
  22|       |******************************************************************************
  23|       |* @file isvce_downscaler_sse42.c
  24|       |*
  25|       |* @brief
  26|       |*  This file contains the x86 SIMD version of the function which does
  27|       |*  horizontal scaling and transpose
  28|       |*
  29|       |* @author
  30|       |*  Ittiam
  31|       |*
  32|       |* @par List of Functions:
  33|       |*  - isvce_horizontal_downscale_and_transpose_sse42()
  34|       |*
  35|       |* @remarks
  36|       |*  None
  37|       |*
  38|       |*******************************************************************************
  39|       |*/
  40|       |
  41|       |/*****************************************************************************/
  42|       |/* File Includes                                                             */
  43|       |/*****************************************************************************/
  44|       |
  45|       |/* System include files */
  46|       |#include <stdio.h>
  47|       |#include <stdlib.h>
  48|       |#include <immintrin.h>
  49|       |
  50|       |/* User include files */
  51|       |#include "ih264_typedefs.h"
  52|       |#include "isvc_macros.h"
  53|       |#include "ih264_platform_macros.h"
  54|       |#include "isvc_defs.h"
  55|       |#include "isvce_defs.h"
  56|       |#include "isvc_structs.h"
  57|       |#include "isvce_downscaler_private_defs.h"
  58|       |
  59|       |/*****************************************************************************/
  60|       |/* Function Definitions                                                      */
  61|       |/*****************************************************************************/
  62|       |
  63|       |/**
  64|       |*******************************************************************************
  65|       |*
  66|       |* @brief
  67|       |*   horizontal scaler function
  68|       |*
  69|       |* @par Description:
  70|       |*   Does horizontal scaling for the given block
  71|       |*
  72|       |* @param[in] ps_scaler
  73|       |*  pointer to downscaler context
  74|       |*
  75|       |* @param[in] ps_src
  76|       |*  pointer to source buffer container
  77|       |*
  78|       |* @param[in] ps_dst
  79|       |*  pointer to destination buffer container
  80|       |*
  81|       |* @param[in] pai1_filters
  82|       |*  pointer to array of downscaler filters
  83|       |*
  84|       |* @param[in] u4_blk_wd
  85|       |*  width of the block after horizontal scaling (output block width)
  86|       |*
  87|       |* @param[in] u4_blk_ht
  88|       |*  height of the current block (input block height)
  89|       |*
  90|       |* @param[in] u1_is_chroma
  91|       |*  flag indicating whether the buffer is luma or chroma
  92|       |*
  93|       |*
  94|       |* @returns
  95|       |*  None
  96|       |* @remarks
  97|       |*  The same function is used for vertical scaling too, as the
  98|       |*  horizontally scaled output is stored in a transposed fashion.
  99|       |*
 100|       |*******************************************************************************
 101|       |*/
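
For reference, the operation the routine below implements can be sketched in scalar C. This is an illustration only, not code from the library: it assumes each pai1_filters entry holds an 8-tap signed 8-bit filter whose taps sum to 128 (which is what the +64 rounding offset and shift by 7 in the intrinsics imply), that get_filter_phase() maps the fractional bits of the Q-format position to a filter index, and that the caller has pre-offset pu1_src so the 4th tap lands on the centre pixel, as the function itself does.

/* Scalar sketch of one luma pass (illustration only, not library code) */
static void downscale_and_transpose_sketch(const UWORD8 *pu1_src, UWORD32 u4_in_stride,
                                           UWORD8 *pu1_dst, UWORD32 u4_out_stride,
                                           FILTER_COEFF_ARRAY pai1_filters,
                                           UWORD32 u4_pos_q, UWORD32 u4_horz_inc_q,
                                           UWORD32 u4_blk_wd, UWORD32 u4_blk_ht)
{
    UWORD32 i, j;
    WORD32 k;

    for(i = 0; i < u4_blk_wd; i++)
    {
        /* integer part of the Q-format position picks the centre pixel,
           the fractional part picks the filter phase */
        const WORD8 *pi1_filt = pai1_filters[get_filter_phase(u4_pos_q)];
        const UWORD8 *pu1_col = pu1_src + (u4_pos_q >> DOWNSCALER_Q);

        for(j = 0; j < u4_blk_ht; j++)
        {
            WORD32 i4_sum = 0;

            for(k = 0; k < 8; k++)
            {
                i4_sum += pu1_col[(j * u4_in_stride) + k] * pi1_filt[k];
            }

            /* round (+64) and divide by 128, then clip to 8 bits */
            i4_sum = (i4_sum + 64) >> 7;
            i4_sum = (i4_sum < 0) ? 0 : ((i4_sum > 255) ? 255 : i4_sum);

            /* transposed store: input row j becomes output column j */
            pu1_dst[(i * u4_out_stride) + j] = (UWORD8) i4_sum;
        }

        u4_pos_q += u4_horz_inc_q;
    }
}
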
 102|       |
 103|       |void isvce_horizontal_downscale_and_transpose_sse42(
 104|       |    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
 105|       |    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
 106|   933k|{
 107|   933k|    WORD32 i, j;
 108|   933k|    UWORD8 u1_phase;
 109|   933k|    UWORD8 *pu1_src_j, *pu1_dst_j;
 110|   933k|    WORD32 i4_temp_pixel_holder;
 111|   933k|    UWORD32 u4_num_iterations_vertical_by_16;
 112|   933k|    UWORD32 u4_rem_vert_loop;
 113|   933k|    UWORD8 *pu1_in_pixel;
 114|   933k|    UWORD8 *pu1_out_pixel;
 115|   933k|    WORD8 *pi1_filter_for_grid;
 116|   933k|    UWORD16 u2_full_pixel_inc;
 117|       |
 118|   933k|    __m128i src_temp_0, src_temp_1, src_temp_2, src_temp_3, src_temp_4, src_temp_5, src_temp_6,
 119|   933k|        src_temp_7;
 120|       |
 121|   933k|    __m128i reg_all_1s, reg_64val_32bit, reg_all_0s, filt_coeff_grid, reg_shuffle;
 122|       |
 123|   933k|    __m128i reg_01_16x8b, reg_02_16x8b, reg_03_16x8b, reg_04_16x8b, reg_05_16x8b;
 124|       |
 125|   933k|    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;
 126|       |
 127|   933k|    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
 128|   933k|    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
 129|   933k|    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
 130|       |
 131|   933k|    UWORD8 *pu1_src = ps_src->pv_data;
 132|   933k|    UWORD32 u4_in_stride = ps_src->i4_data_stride;
 133|   933k|    UWORD8 *pu1_dst = ps_dst->pv_data;
 134|   933k|    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
 135|   933k|    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;
 136|       |
 137|   933k|    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);
 138|       |
 139|   933k|    reg_all_1s = _mm_set1_epi16((short) 1);
 140|   933k|    reg_64val_32bit = _mm_set1_epi32((int) 64);
 141|   933k|    reg_all_0s = _mm_setzero_si128();
 142|   933k|    reg_shuffle = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
 143|       |
 144|   933k|    u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
 145|   933k|    u4_rem_vert_loop = u4_blk_ht % 16;
 146|       |
 147|       |    /* Offset the input so that the input pixel to be processed
 148|       |       coincides with the centre of the filter (4th coefficient) */
 149|   933k|    pu1_src += (1 + u1_is_chroma);
 150|       |
 151|   933k|    if(!u1_is_chroma)
 152|   700k|    {
 153|  3.21M|        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
 154|  2.51M|        {
 155|  2.51M|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
 156|  2.51M|            pu1_dst_j = pu1_dst + (j << 4);
 157|       |
 158|  2.51M|            u4_center_pixel_pos = u4_center_pixel_pos_src;
 159|       |
 160|   102M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
 161|  99.5M|            {
 162|  99.5M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
 163|  99.5M|                pi1_filter_for_grid = pai1_filters[u1_phase];
 164|       |
 165|  99.5M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
 166|       |
 167|  99.5M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
 168|       |
 169|  99.5M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
 170|       |
 171|  99.5M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
 172|       |                /******************************************************/
 173|       |                /* This loop walks down the input vertically, but the */
 174|       |                /* output pixels are stored along a row, i.e. in a    */
 175|       |                /* transposed manner                                  */
 176|       |                /******************************************************/
 177|       |
 178|       |                /* For rows 0,1 */
 179|  99.5M|                src_temp_0 = _mm_loadl_epi64((__m128i *) pu1_in_pixel);
 180|  99.5M|                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride));
 181|       |                /* next move the 8 pixels of src_temp_1 into bits 64-127 of src_temp_0 */
 182|  99.5M|                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
 183|       |
 184|       |                /* For rows 2,3 */
 185|  99.5M|                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
 186|       |
 187|  99.5M|                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
 188|       |
 189|  99.5M|                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
 190|       |
 191|  99.5M|                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 192|       |
 193|       |                /* multiply with filter coeffs to get 16 bit results */
 194|  99.5M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
 195|       |
 196|  99.5M|                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
 197|       |                /* add adjacent 16 bit values to get 32 bit values */
 198|  99.5M|                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
 199|       |
 200|       |                /* Add offset of 64 for rounding each out pixel value */
 201|  99.5M|                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
 202|       |                /* Divide each out pixel value by 128 */
 203|  99.5M|                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, 7);
 204|       |
 205|       |                /* For rows 4,5 */
 206|  99.5M|                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
 207|       |
 208|  99.5M|                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
 209|       |
 210|  99.5M|                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
 211|       |
 212|       |                /* For rows 6,7 */
 213|  99.5M|                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
 214|       |
 215|  99.5M|                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
 216|       |
 217|  99.5M|                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
 218|       |
 219|  99.5M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
 220|       |
 221|  99.5M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
 222|       |
 223|  99.5M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
 224|       |
 225|  99.5M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
 226|       |
 227|       |                /* next add 2 adjacent 32 bit values to get a single
 228|       |                ** 32 bit value for each row
 229|       |                */
 230|       |
 231|       |                /* Add offset of 64 for rounding each out pixel value */
 232|  99.5M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
 233|       |                /* Divide each out pixel value by 128 */
 234|  99.5M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, 7);
 235|       |
 236|       |                /* pack the lower 16 bit values of the 8 output
 237|       |                pixels from reg_01 and reg_03 */
 238|  99.5M|                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
 239|       |
 240|       |                /* For rows 8,9 */
 241|  99.5M|                src_temp_0 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
 242|       |
 243|  99.5M|                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
 244|       |
 245|       |                /* next move the 8 pixels of src_temp_1 into bits 64-127 of src_temp_0 */
 246|  99.5M|                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
 247|       |
 248|       |                /* For rows 10,11 */
 249|  99.5M|                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
 250|       |
 251|  99.5M|                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
 252|       |
 253|  99.5M|                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
 254|       |
 255|  99.5M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 256|       |
 257|       |                /* multiply with filter coeffs to get 16 bit results */
 258|  99.5M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
 259|       |
 260|  99.5M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
 261|       |                /* add adjacent 16 bit values to get 32 bit values */
 262|  99.5M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
 263|       |
 264|       |                /* next add 2 adjacent 32 bit values to get a single
 265|       |                32 bit value for each row */
 266|       |
 267|       |                /* Add offset of 64 for rounding each out pixel value */
 268|  99.5M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
 269|       |                /* Divide each out pixel value by 128 */
 270|  99.5M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, 7);
 271|       |
 272|       |                /* For rows 12,13 */
 273|  99.5M|                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
 274|       |
 275|  99.5M|                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
 276|       |
 277|  99.5M|                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
 278|       |
 279|       |                /* For rows 14,15 */
 280|  99.5M|                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
 281|       |
 282|  99.5M|                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
 283|       |
 284|  99.5M|                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
 285|       |
 286|  99.5M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
 287|       |
 288|  99.5M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
 289|       |
 290|  99.5M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
 291|       |                /* add adjacent 16 bit values to get 32 bit values */
 292|  99.5M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
 293|       |
 294|       |                /* next add 2 adjacent 32 bit values to get a single
 295|       |                32 bit value for each row */
 296|       |
 297|       |                /* Add offset of 64 for rounding each out pixel value */
 298|  99.5M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
 299|       |                /* Divide each out pixel value by 128 */
 300|  99.5M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, 7);
 301|       |
 302|       |                /* pack the lower 16 bit values of the 8 output
 303|       |                pixels from reg_02 and reg_04 */
 304|  99.5M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
 305|       |
 306|       |                /* next get saturated 8 bit output pixel values for rows 0-15 */
 307|  99.5M|                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
 308|       |
 309|       |                /* Store the 16 output values */
 310|  99.5M|                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_01_16x8b);
 311|       |
 312|  99.5M|                pu1_out_pixel += 16;
 313|       |
 314|  99.5M|                pu1_in_pixel += ((u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q);
 315|       |
 316|       |                /* Update the context for the next loop iteration */
 317|  99.5M|                u4_center_pixel_pos += u4_src_horz_increments;
 318|  99.5M|            }
 319|  2.51M|        }
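
The per-register arithmetic in the loop above follows one fixed pattern: _mm_maddubs_epi16 multiplies unsigned pixels by signed taps and sums adjacent pairs, _mm_hadd_epi16 and _mm_madd_epi16 (against a vector of ones) finish the 8-tap reduction, and the +64 / >>7 pair rounds. A minimal self-contained sketch of that pattern for two rows packed in one register (generic types and names, not the library's; it assumes the filter grid stores the 8 taps twice, which is what loading 16 coefficient bytes implies):

#include <stdint.h>
#include <immintrin.h>

/* Reduction pattern sketch (illustration only): two rows of 8 pixels,
 * one 8-tap filter stored twice in a 16-byte grid, two rounded outputs. */
static inline void two_rows_8tap_sketch(const uint8_t *pu1_row0, const uint8_t *pu1_row1,
                                        const int8_t ai1_taps[16], uint8_t au1_out[2])
{
    /* row0 -> bytes 0-7, row1 -> bytes 8-15 */
    __m128i px = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *) pu1_row0),
                                    _mm_loadl_epi64((const __m128i *) pu1_row1));
    __m128i coeff = _mm_loadu_si128((const __m128i *) ai1_taps);

    /* u8 x s8 products, adjacent pairs summed: 4 partial sums per row */
    __m128i acc = _mm_maddubs_epi16(px, coeff);
    /* 4 partial sums per row -> 2 */
    acc = _mm_hadd_epi16(acc, _mm_setzero_si128());
    /* 2 -> 1 (32-bit) per row */
    acc = _mm_madd_epi16(acc, _mm_set1_epi16(1));

    /* +64 then >> 7: round to nearest, assuming the taps sum to 128 */
    acc = _mm_srli_epi32(_mm_add_epi32(acc, _mm_set1_epi32(64)), 7);

    /* saturate down to unsigned 8 bit; outputs land in bytes 0 and 1 */
    acc = _mm_packus_epi16(_mm_packus_epi32(acc, acc), acc);

    au1_out[0] = (uint8_t) (_mm_cvtsi128_si32(acc) & 0xFF);
    au1_out[1] = (uint8_t) ((_mm_cvtsi128_si32(acc) >> 8) & 0xFF);
}
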
 320|       |
 321|       |        /* If height is not a multiple of 16, process the
 322|       |        remaining rows one at a time */
 323|   700k|        if(u4_rem_vert_loop)
 324|   233k|        {
 325|   233k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
 326|   233k|            pu1_dst_j = pu1_dst + (j << 4);
 327|       |
 328|   233k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
 329|       |
 330|  11.0M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
 331|  10.7M|            {
 332|  10.7M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
 333|  10.7M|                pi1_filter_for_grid = pai1_filters[u1_phase];
 334|       |
 335|  10.7M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
 336|       |
 337|  10.7M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
 338|       |
 339|  10.7M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
 340|       |
 341|  10.7M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
 342|       |
 343|  97.0M|                for(j = u4_rem_vert_loop; j > 0; j--)
 344|  86.2M|                {
 345|  86.2M|                    src_temp_0 = _mm_loadl_epi64((__m128i const *) pu1_in_pixel);
 346|       |
 347|  86.2M|                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 348|       |
 349|  86.2M|                    src_temp_0 = _mm_madd_epi16(src_temp_0, reg_all_1s);
 350|       |
 351|  86.2M|                    reg_01_16x8b = _mm_hadd_epi32(src_temp_0, reg_all_0s);
 352|       |
 353|       |                    /* Add offset of 64 for rounding each out pixel value */
 354|  86.2M|                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
 355|       |                    /* Divide each out pixel value by 128 */
 356|  86.2M|                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
 357|       |
 358|  86.2M|                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
 359|       |
 360|       |                    /* next get saturated 8 bit output pixel values */
 361|  86.2M|                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
 362|       |
 363|       |                    /* Store the 1 output value */
 364|  86.2M|                    *pu1_out_pixel = (UWORD8) _mm_cvtsi128_si32(reg_01_16x8b);
 365|       |
 366|  86.2M|                    pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
 367|       |
 368|  86.2M|                    pu1_out_pixel++;
 369|  86.2M|                }
 370|       |                /* Update the context for the next loop iteration */
 371|  10.7M|                u4_center_pixel_pos += u4_src_horz_increments;
 372|  10.7M|            }
 373|   233k|        }
 374|   700k|    }
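
Both branches drive their filter selection from the same Q-format bookkeeping: u4_center_pixel_pos carries the source position with DOWNSCALER_Q fractional bits, its integer part gives the full-pixel index, and its fractional part selects the filter phase. A sketch of just that bookkeeping (illustration only; the parameter names mirror the downscaler-state fields, and DOWNSCALER_Q's actual value stays in the private headers):

/* Fixed-point position bookkeeping sketch (illustration only) */
static void walk_positions_sketch(UWORD32 u4_init_offset, UWORD32 u4_horz_increment,
                                  UWORD32 u4_blk_wd)
{
    UWORD32 i;
    UWORD32 u4_pos = u4_init_offset; /* source position, DOWNSCALER_Q fractional bits */

    for(i = 0; i < u4_blk_wd; i++)
    {
        UWORD32 u4_full_pel = u4_pos >> DOWNSCALER_Q; /* integer pixel index */
        UWORD8 u1_ph = get_filter_phase(u4_pos);      /* fractional bits select the phase */

        /* ... filter around source pixel u4_full_pel with pai1_filters[u1_ph] ... */
        (void) u4_full_pel;
        (void) u1_ph;

        u4_pos += u4_horz_increment; /* step = input_width/output_width in Q format */
    }
}
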
 375|       |
 376|   233k|    else /* for chroma */
 377|   233k|    {
 378|   831k|        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
 379|   598k|        {
 380|   598k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
 381|   598k|            pu1_dst_j = pu1_dst + (j << 4);
 382|       |
 383|   598k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
 384|       |
 385|  28.9M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
 386|  28.3M|            {
 387|  28.3M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
 388|  28.3M|                pi1_filter_for_grid = pai1_filters[u1_phase];
 389|       |
 390|  28.3M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
 391|       |
 392|  28.3M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
 393|       |
 394|  28.3M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
 395|       |
 396|  28.3M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
 397|       |                /******************************************************/
 398|       |                /* This loop walks down the input vertically, but the */
 399|       |                /* output pixels are stored along a row, i.e. in a    */
 400|       |                /* transposed manner                                  */
 401|       |                /******************************************************/
 402|       |
 403|       |                /* Load 16 values, shuffle to separate Cb and Cr, and process */
 404|       |
 405|  28.3M|                src_temp_0 = _mm_loadu_si128((__m128i *) pu1_in_pixel);
 406|  28.3M|                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride));
 407|       |
 408|  28.3M|                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
 409|       |
 410|  28.3M|                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
 411|       |
 412|  28.3M|                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
 413|  28.3M|                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
 414|  28.3M|                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
 415|  28.3M|                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
 416|       |
 417|  28.3M|                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 418|  28.3M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
 419|       |
 420|  28.3M|                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
 421|       |
 422|  28.3M|                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
 423|       |
 424|  28.3M|                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
 425|       |
 426|  28.3M|                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
 427|       |
 428|  28.3M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
 429|  28.3M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
 430|       |
 431|  28.3M|                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
 432|       |
 433|  28.3M|                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
 434|       |
 435|  28.3M|                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
 436|       |
 437|  28.3M|                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
 438|       |
 439|  28.3M|                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
 440|  28.3M|                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
 441|  28.3M|                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
 442|  28.3M|                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
 443|       |
 444|  28.3M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
 445|       |
 446|  28.3M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
 447|       |
 448|  28.3M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
 449|       |
 450|  28.3M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
 451|       |
 452|  28.3M|                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
 453|       |
 454|  28.3M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
 455|  28.3M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
 456|       |
 457|  28.3M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_04_16x8b);
 458|       |
 459|  28.3M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
 460|       |
 461|  28.3M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
 462|       |
 463|  28.3M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
 464|       |
 465|  28.3M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
 466|  28.3M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
 467|       |
 468|  28.3M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
 469|       |
 470|  28.3M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
 471|       |
 472|  28.3M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
 473|       |
 474|  28.3M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
 475|       |
 476|  28.3M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_03_16x8b);
 477|       |
 478|  28.3M|                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
 479|       |
 480|  28.3M|                reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
 481|       |
 482|  28.3M|                src_temp_0 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
 483|       |
 484|  28.3M|                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
 485|       |
 486|  28.3M|                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
 487|       |
 488|  28.3M|                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
 489|       |
 490|  28.3M|                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
 491|  28.3M|                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
 492|  28.3M|                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
 493|  28.3M|                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
 494|       |
 495|  28.3M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 496|  28.3M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
 497|       |
 498|  28.3M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
 499|       |
 500|  28.3M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
 501|       |
 502|  28.3M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
 503|       |
 504|  28.3M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
 505|       |
 506|  28.3M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
 507|  28.3M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
 508|       |
 509|  28.3M|                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
 510|       |
 511|  28.3M|                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
 512|       |
 513|  28.3M|                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
 514|       |
 515|  28.3M|                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
 516|       |
 517|  28.3M|                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
 518|  28.3M|                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
 519|  28.3M|                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
 520|  28.3M|                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
 521|       |
 522|  28.3M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
 523|       |
 524|  28.3M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
 525|       |
 526|  28.3M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
 527|       |
 528|  28.3M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
 529|       |
 530|  28.3M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
 531|       |
 532|  28.3M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
 533|  28.3M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
 534|       |
 535|  28.3M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_05_16x8b);
 536|       |
 537|  28.3M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
 538|       |
 539|  28.3M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
 540|       |
 541|  28.3M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
 542|       |
 543|  28.3M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
 544|  28.3M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
 545|       |
 546|  28.3M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
 547|       |
 548|  28.3M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
 549|       |
 550|  28.3M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
 551|       |
 552|  28.3M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
 553|       |
 554|  28.3M|                reg_03_16x8b = _mm_packus_epi32(reg_03_16x8b, reg_04_16x8b);
 555|       |
 556|  28.3M|                reg_02_16x8b = _mm_packus_epi16(reg_02_16x8b, reg_03_16x8b);
 557|       |
 558|  28.3M|                reg_02_16x8b = _mm_shuffle_epi8(reg_02_16x8b, reg_shuffle);
 559|       |
 560|  28.3M|                reg_03_16x8b = _mm_unpacklo_epi64(reg_01_16x8b, reg_02_16x8b);
 561|       |
 562|  28.3M|                reg_04_16x8b = _mm_unpackhi_epi64(reg_01_16x8b, reg_02_16x8b);
 563|       |
 564|       |                /* Store after shuffling again */
 565|       |
 566|  28.3M|                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_03_16x8b);
 567|  28.3M|                _mm_storeu_si128((__m128i *) (pu1_out_pixel + u4_out_stride), reg_04_16x8b);
 568|       |
 569|  28.3M|                pu1_out_pixel += 16;
 570|       |
 571|  28.3M|                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;
 572|       |
 573|       |                /* Update the context for the next loop iteration */
 574|  28.3M|                u4_center_pixel_pos += u4_src_horz_increments;
 575|  28.3M|            }
 576|   598k|        }
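
The chroma path leans on a single shuffle mask. reg_shuffle moves even-indexed bytes to lanes 0-7 and odd-indexed bytes to lanes 8-15: applied to a load it splits interleaved CbCrCbCr... data into a Cb half and a Cr half, and applied again to the per-row packed results (whose even bytes are Cb outputs and odd bytes Cr outputs) it regroups them so the two 64-bit unpacks can emit one full Cb row and one full Cr row for the pair of transposed stores. A standalone sketch of the mask's effect (illustration only, generic types):

#include <stdint.h>
#include <immintrin.h>

/* Even/odd byte split sketch (illustration only): CbCrCbCr... in,
 * Cb bytes in lanes 0-7 and Cr bytes in lanes 8-15 out. */
static inline __m128i split_cb_cr_sketch(const uint8_t *pu1_cbcr)
{
    const __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                      14, 12, 10, 8, 6, 4, 2, 0);

    return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) pu1_cbcr), mask);
}
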
 577|       |
 578|       |        /* If height is not a multiple of 16, process 2 rows at a
 579|       |        time for the remaining rows */
 580|   233k|        if(u4_rem_vert_loop)
 581|   199k|        {
 582|   199k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
 583|   199k|            pu1_dst_j = pu1_dst + (j << 4);
 584|       |
 585|   199k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
 586|  9.09M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
 587|  8.89M|            {
 588|  8.89M|                UWORD8 u1_phase = get_filter_phase(u4_center_pixel_pos);
 589|  8.89M|                pi1_filter_for_grid = pai1_filters[u1_phase];
 590|       |
 591|  8.89M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
 592|       |
 593|  8.89M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
 594|       |
 595|  8.89M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
 596|       |
 597|  8.89M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
 598|       |
 599|  44.4M|                for(j = u4_rem_vert_loop; j > 0; j = j - 2)
 600|  35.5M|                {
 601|  35.5M|                    src_temp_0 = _mm_loadu_si128((__m128i const *) pu1_in_pixel);
 602|  35.5M|                    src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
 603|       |
 604|  35.5M|                    src_temp_1 = _mm_loadu_si128((__m128i const *) (pu1_in_pixel + u4_in_stride));
 605|       |
 606|  35.5M|                    src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
 607|       |
 608|  35.5M|                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
 609|  35.5M|                    src_temp_1 = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
 610|       |
 611|  35.5M|                    reg_01_16x8b = _mm_hadd_epi16(src_temp_0, src_temp_1);
 612|       |
 613|  35.5M|                    reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
 614|       |
 615|       |                    /* Add offset of 64 for rounding each out pixel value */
 616|  35.5M|                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
 617|       |                    /* Divide each out pixel value by 128 */
 618|  35.5M|                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
 619|       |
 620|  35.5M|                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
 621|       |
 622|       |                    /* next get saturated 8 bit output pixel values */
 623|  35.5M|                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
 624|       |
 625|  35.5M|                    reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
 626|       |
 627|  35.5M|                    reg_02_16x8b = _mm_srli_si128(reg_01_16x8b, (int) 8);
 628|       |
 629|       |                    /* Store the 2 output values */
 630|  35.5M|                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_01_16x8b);
 631|       |
 632|  35.5M|                    *pu1_out_pixel = (UWORD8) i4_temp_pixel_holder;
 633|  35.5M|                    i4_temp_pixel_holder >>= 8;
 634|       |
 635|  35.5M|                    *(pu1_out_pixel + 1) = (UWORD8) i4_temp_pixel_holder;
 636|       |
 637|  35.5M|                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_02_16x8b);
 638|       |
 639|  35.5M|                    *(pu1_out_pixel + u4_out_stride) = (UWORD8) i4_temp_pixel_holder;
 640|  35.5M|                    i4_temp_pixel_holder >>= 8;
 641|       |
 642|  35.5M|                    *(pu1_out_pixel + u4_out_stride + 1) = (UWORD8) i4_temp_pixel_holder;
 643|       |
 644|  35.5M|                    pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 1)) >> DOWNSCALER_Q;
 645|  35.5M|                    pu1_out_pixel += 2;
 646|  35.5M|                }
 647|       |                /* Update the context for the next loop iteration */
 648|  8.89M|                u4_center_pixel_pos += u4_src_horz_increments;
 649|  8.89M|            }
 650|   199k|        }
 651|   233k|    }
 652|   933k|}
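
As the @remarks block notes, the transposed store is what lets this one horizontal routine perform both passes of a 2-D downscale. A hypothetical driver sketch of that idea (the scratch buffer and size variables here are placeholders, not the library's actual call sequence):

/* Hypothetical two-pass driver (illustration only) */
/* Pass 1: scale widths; the output lands transposed in ps_scratch */
isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_src, ps_scratch, pai1_filters,
                                               u4_scaled_wd, u4_in_ht, u1_is_chroma);

/* Pass 2: the "rows" of ps_scratch are the picture's columns, so the same
   horizontal pass now scales vertically and transposes back */
isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_scratch, ps_dst, pai1_filters,
                                               u4_scaled_ht, u4_scaled_wd, u1_is_chroma);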