Coverage Report

Created: 2026-01-10 06:44

/src/libavc/encoder/x86/svc/isvce_downscaler_sse42.c
 Line| Count|Source
    1|      |/******************************************************************************
    2|      | *
    3|      | * Copyright (C) 2022 The Android Open Source Project
    4|      | *
    5|      | * Licensed under the Apache License, Version 2.0 (the "License");
    6|      | * you may not use this file except in compliance with the License.
    7|      | * You may obtain a copy of the License at:
    8|      | *
    9|      | * http://www.apache.org/licenses/LICENSE-2.0
   10|      | *
   11|      | * Unless required by applicable law or agreed to in writing, software
   12|      | * distributed under the License is distributed on an "AS IS" BASIS,
   13|      | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14|      | * See the License for the specific language governing permissions and
   15|      | * limitations under the License.
   16|      | *
   17|      | *****************************************************************************
   18|      | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
   19|      | */
   20|      |
   21|      |/**
   22|      |******************************************************************************
   23|      |* @file isvce_downscaler_sse42.c
   24|      |*
   25|      |* @brief
   26|      |*  This file contains the x86 SIMD version of the function which does
   27|      |*  horizontal scaling and transpose
   28|      |*
   29|      |* @author
   30|      |*  Ittiam
   31|      |*
   32|      |* @par List of Functions:
   33|      |*  - isvce_horizontal_downscale_and_transpose_sse42()
   34|      |*
   35|      |* @remarks
   36|      |*  None
   37|      |*
   38|      |*******************************************************************************
   39|      |*/
   40|      |
   41|      |/*****************************************************************************/
   42|      |/* File Includes                                                             */
   43|      |/*****************************************************************************/
   44|      |
   45|      |/* System include files */
   46|      |#include <stdio.h>
   47|      |#include <stdlib.h>
   48|      |#include <immintrin.h>
   49|      |
   50|      |/* User include files */
   51|      |#include "ih264_typedefs.h"
   52|      |#include "isvc_macros.h"
   53|      |#include "ih264_platform_macros.h"
   54|      |#include "isvc_defs.h"
   55|      |#include "isvce_defs.h"
   56|      |#include "isvc_structs.h"
   57|      |#include "isvce_downscaler_private_defs.h"
   58|      |
   59|      |/*****************************************************************************/
   60|      |/* Function Definitions                                                      */
   61|      |/*****************************************************************************/
   62|      |
   63|      |/**
   64|      |*******************************************************************************
   65|      |*
   66|      |* @brief
   67|      |*   horizontal scaler function
   68|      |*
   69|      |* @par Description:
   70|      |*   Does horizontal scaling for the given block
   71|      |*
   72|      |* @param[in] ps_scaler
   73|      |*  pointer to downscaler context
   74|      |*
   75|      |* @param[in] ps_src
   76|      |*  pointer to source buffer container
   77|      |*
   78|      |* @param[in] ps_dst
   79|      |*  pointer to destination buffer container
   80|      |*
   81|      |* @param[in] pai1_filters
   82|      |*  pointer to array of downscaler filters
   83|      |*
   84|      |* @param[in] u4_blk_wd
   85|      |*  width of the block after horizontal scaling (output block width)
   86|      |*
   87|      |* @param[in] u4_blk_ht
   88|      |*  height of the current block (input block height)
   89|      |*
   90|      |* @param[in] u1_is_chroma
   91|      |*  flag indicating whether the buffer is luma or chroma
   92|      |*
   93|      |*
   94|      |* @returns
   95|      |*
   96|      |* @remarks
   97|      |*  The same function is used for vertical scaling too, as
   98|      |*  the horizontally scaled output is stored in a transposed fashion.
   99|      |*
  100|      |*******************************************************************************
  101|      |*/
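
For orientation, here is a minimal scalar sketch of what the SIMD function below computes. It is an assumption-laden reference, not the shipped code: the 8-tap filter whose coefficients sum to 128 is inferred from the (+64, >>7) rounding used below, get_filter_phase() and DOWNSCALER_Q come from isvce_downscaler_private_defs.h, and every other name here is illustrative.

    /* Reference sketch (luma case): one transposed output column per source position */
    static void downscale_and_transpose_ref(UWORD8 *pu1_src, WORD32 i4_src_stride,
                                            UWORD8 *pu1_dst, WORD32 i4_dst_stride,
                                            FILTER_COEFF_ARRAY pai1_filters,
                                            UWORD32 u4_blk_wd, UWORD32 u4_blk_ht,
                                            UWORD32 u4_pos, UWORD32 u4_horz_inc)
    {
        UWORD32 i, j;

        for(i = 0; i < u4_blk_wd; i++)
        {
            const WORD8 *pi1_coeff = pai1_filters[get_filter_phase(u4_pos)];
            UWORD8 *pu1_in = pu1_src + (u4_pos >> DOWNSCALER_Q);

            for(j = 0; j < u4_blk_ht; j++)
            {
                WORD32 k, i4_acc = 0;

                /* 8-tap horizontal filter around the centre pixel */
                for(k = 0; k < 8; k++)
                {
                    i4_acc += pu1_in[k] * pi1_coeff[k];
                }

                /* round (+64), divide by 128, saturate to 8 bits */
                i4_acc = (i4_acc + 64) >> 7;
                if(i4_acc < 0) i4_acc = 0;
                if(i4_acc > 255) i4_acc = 255;

                /* input row j becomes output column j: the transposed store */
                pu1_dst[i * i4_dst_stride + j] = (UWORD8) i4_acc;

                pu1_in += i4_src_stride;
            }

            u4_pos += u4_horz_inc; /* Q-format step to the next centre position */
        }
    }
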
  102|      |
  103|      |void isvce_horizontal_downscale_and_transpose_sse42(
  104|      |    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
  105|      |    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
  106| 1.04M|{
  107| 1.04M|    WORD32 i, j;
  108| 1.04M|    UWORD8 u1_phase;
  109| 1.04M|    UWORD8 *pu1_src_j, *pu1_dst_j;
  110| 1.04M|    WORD32 i4_temp_pixel_holder;
  111| 1.04M|    UWORD32 u4_num_iterations_vertical_by_16;
  112| 1.04M|    UWORD32 u4_rem_vert_loop;
  113| 1.04M|    UWORD8 *pu1_in_pixel;
  114| 1.04M|    UWORD8 *pu1_out_pixel;
  115| 1.04M|    WORD8 *pi1_filter_for_grid;
  116| 1.04M|    UWORD16 u2_full_pixel_inc;
  117|      |
  118| 1.04M|    __m128i src_temp_0, src_temp_1, src_temp_2, src_temp_3, src_temp_4, src_temp_5, src_temp_6,
  119| 1.04M|        src_temp_7;
  120|      |
  121| 1.04M|    __m128i reg_all_1s, reg_64val_32bit, reg_all_0s, filt_coeff_grid, reg_shuffle;
  122|      |
  123| 1.04M|    __m128i reg_01_16x8b, reg_02_16x8b, reg_03_16x8b, reg_04_16x8b, reg_05_16x8b;
  124|      |
  125| 1.04M|    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;
  126|      |
  127| 1.04M|    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
  128| 1.04M|    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
  129| 1.04M|    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
  130|      |
  131| 1.04M|    UWORD8 *pu1_src = ps_src->pv_data;
  132| 1.04M|    UWORD32 u4_in_stride = ps_src->i4_data_stride;
  133| 1.04M|    UWORD8 *pu1_dst = ps_dst->pv_data;
  134| 1.04M|    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
  135| 1.04M|    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;
  136|      |
  137| 1.04M|    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);
  138|      |
  139| 1.04M|    reg_all_1s = _mm_set1_epi16((short) 1);
  140| 1.04M|    reg_64val_32bit = _mm_set1_epi32((int) 64);
  141| 1.04M|    reg_all_0s = _mm_setzero_si128();
  142| 1.04M|    reg_shuffle = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
  143|      |
  144| 1.04M|    u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
  145| 1.04M|    u4_rem_vert_loop = u4_blk_ht % 16;
  146|      |
  147|      |    /* Offset the input so that the input pixel to be processed
  148|      |       coincides with the centre of the filter (4th coefficient) */
  149| 1.04M|    pu1_src += (1 + u1_is_chroma);
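
The centre position u4_center_pixel_pos is tracked in DOWNSCALER_Q fixed point: its integer part selects the full-pixel offset (u2_full_pixel_inc below), and its fractional part selects the filter phase. A hedged sketch of that decode follows; the phase extraction shown is an assumption for illustration (e.g. eight phases kept in the top fraction bits), while the real get_filter_phase() is defined in the private headers included above.

    /* Illustrative decode of the Q-format position (assumed semantics only)     */
    UWORD32 u4_int_part = u4_center_pixel_pos >> DOWNSCALER_Q;          /* full-pixel offset  */
    UWORD32 u4_frac = u4_center_pixel_pos & ((1 << DOWNSCALER_Q) - 1);  /* sub-pixel fraction */
    /* With e.g. 8 phases, get_filter_phase() would keep the top 3 fraction bits */
    UWORD8 u1_phase_sketch = (UWORD8) (u4_frac >> (DOWNSCALER_Q - 3));
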
  150|      |
  151| 1.04M|    if(!u1_is_chroma)
  152|  780k|    {
  153| 3.57M|        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
  154| 2.79M|        {
  155| 2.79M|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
  156| 2.79M|            pu1_dst_j = pu1_dst + (j << 4);
  157|      |
  158| 2.79M|            u4_center_pixel_pos = u4_center_pixel_pos_src;
  159|      |
  160|  113M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
  161|  110M|            {
  162|  110M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
  163|  110M|                pi1_filter_for_grid = pai1_filters[u1_phase];
  164|      |
  165|  110M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
  166|      |
  167|  110M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
  168|      |
  169|  110M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
  170|      |
  171|  110M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
  172|      |                /******************************************************/
  173|      |                /* This loop walks down the input vertically, but    */
  174|      |                /* the output pixels are stored horizontally, in a   */
  175|      |                /* transposed manner                                 */
  176|      |                /******************************************************/
  177|      |
  178|      |                /*For rows 0,1*/
  179|  110M|                src_temp_0 = _mm_loadl_epi64((__m128i *) pu1_in_pixel);
  180|  110M|                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride));
  181|      |                /*move the 8 pixels of src_temp_1 into bits 64-127 of src_temp_0*/
  182|  110M|                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
  183|      |
  184|      |                /*For rows 2,3*/
  185|  110M|                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
  186|      |
  187|  110M|                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
  188|      |
  189|  110M|                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
  190|      |
  191|  110M|                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  192|      |
  193|      |                /*multiply with the filter coeffs to get 16 bit results*/
  194|  110M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
  195|      |
  196|  110M|                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
  197|      |                /*add adjacent 16 bit values to get 32 bit values*/
  198|  110M|                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
  199|      |
  200|      |                /*Add an offset of 64 to round each output pixel value*/
  201|  110M|                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
  202|      |                /*Divide each output pixel value by 128*/
  203|  110M|                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, 7);
  204|      |
  205|      |                /*For rows 4,5*/
  206|  110M|                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
  207|      |
  208|  110M|                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
  209|      |
  210|  110M|                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
  211|      |
  212|      |                /*For rows 6,7*/
  213|  110M|                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
  214|      |
  215|  110M|                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
  216|      |
  217|  110M|                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
  218|      |
  219|  110M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
  220|      |
  221|  110M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
  222|      |
  223|  110M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
  224|      |
  225|  110M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
  226|      |
  227|      |                /*next add 2 adjacent 32 bit values to get a single 32 bit
  228|      |                **value in each row
  229|      |                */
  230|      |
  231|      |                /*Add an offset of 64 to round each output pixel value*/
  232|  110M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
  233|      |                /*Divide each output pixel value by 128*/
  234|  110M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, 7);
  235|      |
  236|      |                /*pack the lower 16 bit values corresponding to the 8 output
  237|      |                pixels from reg_01 and reg_03*/
  238|  110M|                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
  239|      |
  240|      |                /*For rows 8,9*/
  241|  110M|                src_temp_0 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
  242|      |
  243|  110M|                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
  244|      |
  245|      |                /*move the 8 pixels of src_temp_1 into bits 64-127 of src_temp_0*/
  246|  110M|                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
  247|      |
  248|      |                /*For rows 10,11*/
  249|  110M|                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
  250|      |
  251|  110M|                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
  252|      |
  253|  110M|                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
  254|      |
  255|  110M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  256|      |
  257|      |                /*multiply with the filter coeffs to get 16 bit results*/
  258|  110M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
  259|      |
  260|  110M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
  261|      |                /*add adjacent 16 bit values to get 32 bit values*/
  262|  110M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
  263|      |
  264|      |                /*next add 2 adjacent 32 bit values to get a single
  265|      |                32 bit value in each row*/
  266|      |
  267|      |                /*Add an offset of 64 to round each output pixel value*/
  268|  110M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
  269|      |                /*Divide each output pixel value by 128*/
  270|  110M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, 7);
  271|      |
  272|      |                /*For rows 12,13*/
  273|  110M|                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
  274|      |
  275|  110M|                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
  276|      |
  277|  110M|                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
  278|      |
  279|      |                /*For rows 14,15*/
  280|  110M|                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
  281|      |
  282|  110M|                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
  283|      |
  284|  110M|                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
  285|      |
  286|  110M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
  287|      |
  288|  110M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
  289|      |
  290|  110M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
  291|      |                /*add adjacent 16 bit values to get 32 bit values*/
  292|  110M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
  293|      |
  294|      |                /*next add 2 adjacent 32 bit values to get a single
  295|      |                32 bit value in each row*/
  296|      |
  297|      |                /*Add an offset of 64 to round each output pixel value*/
  298|  110M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
  299|      |                /*Divide each output pixel value by 128*/
  300|  110M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, 7);
  301|      |
  302|      |                /*pack the lower 16 bit values corresponding to the 8 output
  303|      |                pixels from reg_02 and reg_04*/
  304|  110M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
  305|      |
  306|      |                /*next get saturated 8 bit output pixel values for rows 0-15*/
  307|  110M|                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
  308|      |
  309|      |                /*Store the 16 output values*/
  310|  110M|                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_01_16x8b);
  311|      |
  312|  110M|                pu1_out_pixel += 16;
  313|      |
  314|  110M|                pu1_in_pixel += ((u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q);
  315|      |
  316|      |                /* Update the context for the next loop iteration */
  317|  110M|                u4_center_pixel_pos += u4_src_horz_increments;
  318|  110M|            }
  319| 2.79M|        }
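
Each 8-tap dot product above is evaluated as _mm_maddubs_epi16 (unsigned pixels times signed coefficients, pairwise sums to 16 bits), then _mm_hadd_epi16, then _mm_madd_epi16 against all-ones to complete the 32-bit sum, followed by the +64 / >>7 rounding and packus saturation. For reference, a scalar equivalent of that reduction for a single output pixel (a sketch using only names already in scope; CLIP_U8 from ih264_platform_macros.h would serve the same purpose as the explicit clamps):

    /* Scalar equivalent of the maddubs/hadd/madd reduction for one pixel */
    WORD32 k, i4_acc = 0;

    for(k = 0; k < 8; k++)
    {
        i4_acc += (WORD32) pu1_in_pixel[k] * pi1_filter_for_grid[k];
    }

    i4_acc = (i4_acc + 64) >> 7;   /* round, then divide by 128 */
    if(i4_acc < 0) i4_acc = 0;     /* packus-style saturation   */
    if(i4_acc > 255) i4_acc = 255;
    *pu1_out_pixel = (UWORD8) i4_acc;
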
  320|      |
  321|      |        /*if the height is not a multiple of 16, process one
  322|      |        row at a time for the remaining rows*/
  323|  780k|        if(u4_rem_vert_loop)
  324|  260k|        {
  325|  260k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
  326|  260k|            pu1_dst_j = pu1_dst + (j << 4);
  327|      |
  328|  260k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
  329|      |
  330| 12.2M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
  331| 12.0M|            {
  332| 12.0M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
  333| 12.0M|                pi1_filter_for_grid = pai1_filters[u1_phase];
  334|      |
  335| 12.0M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
  336|      |
  337| 12.0M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
  338|      |
  339| 12.0M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
  340|      |
  341| 12.0M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
  342|      |
  343|  108M|                for(j = u4_rem_vert_loop; j > 0; j--)
  344| 96.0M|                {
  345| 96.0M|                    src_temp_0 = _mm_loadl_epi64((__m128i const *) pu1_in_pixel);
  346|      |
  347| 96.0M|                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  348|      |
  349| 96.0M|                    src_temp_0 = _mm_madd_epi16(src_temp_0, reg_all_1s);
  350|      |
  351| 96.0M|                    reg_01_16x8b = _mm_hadd_epi32(src_temp_0, reg_all_0s);
  352|      |
  353|      |                    /*Add an offset of 64 to round each output pixel value*/
  354| 96.0M|                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
  355|      |                    /*Divide each output pixel value by 128*/
  356| 96.0M|                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
  357|      |
  358| 96.0M|                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
  359|      |
  360|      |                    /*next get the saturated 8 bit output pixel value*/
  361| 96.0M|                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
  362|      |
  363|      |                    /*Store the single output value*/
  364| 96.0M|                    *pu1_out_pixel = (UWORD8) _mm_cvtsi128_si32(reg_01_16x8b);
  365|      |
  366| 96.0M|                    pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
  367|      |
  368| 96.0M|                    pu1_out_pixel++;
  369| 96.0M|                }
  370|      |                /* Update the context for the next loop iteration */
  371| 12.0M|                u4_center_pixel_pos += u4_src_horz_increments;
  372| 12.0M|            }
  373|  260k|        }
  374|  780k|    }
  375|      |
  376|  260k|    else /* for chroma */
  377|  260k|    {
  378|  926k|        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
  379|  666k|        {
  380|  666k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
  381|  666k|            pu1_dst_j = pu1_dst + (j << 4);
  382|      |
  383|  666k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
  384|      |
  385| 32.1M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
  386| 31.4M|            {
  387| 31.4M|                u1_phase = get_filter_phase(u4_center_pixel_pos);
  388| 31.4M|                pi1_filter_for_grid = pai1_filters[u1_phase];
  389|      |
  390| 31.4M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
  391|      |
  392| 31.4M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
  393|      |
  394| 31.4M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
  395|      |
  396| 31.4M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
  397|      |                /******************************************************/
  398|      |                /* This loop walks down the input vertically, but    */
  399|      |                /* the output pixels are stored horizontally, in a   */
  400|      |                /* transposed manner                                 */
  401|      |                /******************************************************/
  402|      |
  403|      |                /*Load 16 values, shuffle to separate Cb and Cr, and process*/
  404|      |
  405| 31.4M|                src_temp_0 = _mm_loadu_si128((__m128i *) pu1_in_pixel);
  406| 31.4M|                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride));
  407|      |
  408| 31.4M|                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
  409|      |
  410| 31.4M|                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
  411|      |
  412| 31.4M|                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
  413| 31.4M|                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
  414| 31.4M|                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
  415| 31.4M|                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
  416|      |
  417| 31.4M|                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  418| 31.4M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
  419|      |
  420| 31.4M|                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
  421|      |
  422| 31.4M|                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
  423|      |
  424| 31.4M|                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
  425|      |
  426| 31.4M|                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
  427|      |
  428| 31.4M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
  429| 31.4M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
  430|      |
  431| 31.4M|                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
  432|      |
  433| 31.4M|                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
  434|      |
  435| 31.4M|                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
  436|      |
  437| 31.4M|                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
  438|      |
  439| 31.4M|                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
  440| 31.4M|                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
  441| 31.4M|                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
  442| 31.4M|                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
  443|      |
  444| 31.4M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
  445|      |
  446| 31.4M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
  447|      |
  448| 31.4M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
  449|      |
  450| 31.4M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
  451|      |
  452| 31.4M|                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
  453|      |
  454| 31.4M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
  455| 31.4M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
  456|      |
  457| 31.4M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_04_16x8b);
  458|      |
  459| 31.4M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
  460|      |
  461| 31.4M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
  462|      |
  463| 31.4M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
  464|      |
  465| 31.4M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
  466| 31.4M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
  467|      |
  468| 31.4M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
  469|      |
  470| 31.4M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
  471|      |
  472| 31.4M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
  473|      |
  474| 31.4M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
  475|      |
  476| 31.4M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_03_16x8b);
  477|      |
  478| 31.4M|                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
  479|      |
  480| 31.4M|                reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
  481|      |
  482| 31.4M|                src_temp_0 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
  483|      |
  484| 31.4M|                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
  485|      |
  486| 31.4M|                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
  487|      |
  488| 31.4M|                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
  489|      |
  490| 31.4M|                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
  491| 31.4M|                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
  492| 31.4M|                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
  493| 31.4M|                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
  494|      |
  495| 31.4M|                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  496| 31.4M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
  497|      |
  498| 31.4M|                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
  499|      |
  500| 31.4M|                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
  501|      |
  502| 31.4M|                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
  503|      |
  504| 31.4M|                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
  505|      |
  506| 31.4M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
  507| 31.4M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
  508|      |
  509| 31.4M|                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
  510|      |
  511| 31.4M|                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
  512|      |
  513| 31.4M|                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
  514|      |
  515| 31.4M|                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
  516|      |
  517| 31.4M|                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
  518| 31.4M|                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
  519| 31.4M|                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
  520| 31.4M|                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
  521|      |
  522| 31.4M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
  523|      |
  524| 31.4M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
  525|      |
  526| 31.4M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
  527|      |
  528| 31.4M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
  529|      |
  530| 31.4M|                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
  531|      |
  532| 31.4M|                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
  533| 31.4M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
  534|      |
  535| 31.4M|                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_05_16x8b);
  536|      |
  537| 31.4M|                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
  538|      |
  539| 31.4M|                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
  540|      |
  541| 31.4M|                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
  542|      |
  543| 31.4M|                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
  544| 31.4M|                reg_05_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
  545|      |
  546| 31.4M|                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
  547|      |
  548| 31.4M|                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
  549|      |
  550| 31.4M|                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
  551|      |
  552| 31.4M|                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
  553|      |
  554| 31.4M|                reg_03_16x8b = _mm_packus_epi32(reg_03_16x8b, reg_04_16x8b);
  555|      |
  556| 31.4M|                reg_02_16x8b = _mm_packus_epi16(reg_02_16x8b, reg_03_16x8b);
  557|      |
  558| 31.4M|                reg_02_16x8b = _mm_shuffle_epi8(reg_02_16x8b, reg_shuffle);
  559|      |
  560| 31.4M|                reg_03_16x8b = _mm_unpacklo_epi64(reg_01_16x8b, reg_02_16x8b);
  561|      |
  562| 31.4M|                reg_04_16x8b = _mm_unpackhi_epi64(reg_01_16x8b, reg_02_16x8b);
  563|      |
  564|      |                /*Store after shuffling again*/
  565|      |
  566| 31.4M|                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_03_16x8b);
  567| 31.4M|                _mm_storeu_si128((__m128i *) (pu1_out_pixel + u4_out_stride), reg_04_16x8b);
  568|      |
  569| 31.4M|                pu1_out_pixel += 16;
  570|      |
  571| 31.4M|                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;
  572|      |
  573|      |                /* Update the context for the next loop iteration */
  574| 31.4M|                u4_center_pixel_pos += u4_src_horz_increments;
  575| 31.4M|            }
  576|  666k|        }
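
In this chroma path, reg_shuffle = (15, 13, ..., 1 | 14, 12, ..., 0) de-interleaves each 16-byte load: the even source bytes (Cb, assuming interleaved CbCr order) land in the low 8 bytes and the odd bytes (Cr) in the high 8 bytes, so a single maddubs applies the filter taps to both planes at once. A scalar sketch of that byte permutation, for illustration only:

    /* Scalar sketch of the reg_shuffle de-interleave (illustrative):      */
    /* interleaved CbCrCbCr... -> 8 Cb bytes, then 8 Cr bytes              */
    UWORD8 au1_deinterleaved[16];
    WORD32 k;

    for(k = 0; k < 8; k++)
    {
        au1_deinterleaved[k] = pu1_in_pixel[2 * k];         /* even bytes: Cb */
        au1_deinterleaved[k + 8] = pu1_in_pixel[2 * k + 1]; /* odd bytes:  Cr */
    }
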
  577|      |
  578|      |        /*if the height is not a multiple of 16, process 2 rows at a
  579|      |        time for the remaining rows*/
  580|  260k|        if(u4_rem_vert_loop)
  581|  223k|        {
  582|  223k|            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
  583|  223k|            pu1_dst_j = pu1_dst + (j << 4);
  584|      |
  585|  223k|            u4_center_pixel_pos = u4_center_pixel_pos_src;
  586| 10.2M|            for(i = 0; i < (WORD32) u4_blk_wd; i++)
  587| 9.97M|            {
  588| 9.97M|                UWORD8 u1_phase = get_filter_phase(u4_center_pixel_pos);
  589| 9.97M|                pi1_filter_for_grid = pai1_filters[u1_phase];
  590|      |
  591| 9.97M|                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
  592|      |
  593| 9.97M|                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
  594|      |
  595| 9.97M|                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
  596|      |
  597| 9.97M|                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
  598|      |
  599| 49.8M|                for(j = u4_rem_vert_loop; j > 0; j = j - 2)
  600| 39.9M|                {
  601| 39.9M|                    src_temp_0 = _mm_loadu_si128((__m128i const *) pu1_in_pixel);
  602| 39.9M|                    src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
  603|      |
  604| 39.9M|                    src_temp_1 = _mm_loadu_si128((__m128i const *) (pu1_in_pixel + u4_in_stride));
  605|      |
  606| 39.9M|                    src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
  607|      |
  608| 39.9M|                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
  609| 39.9M|                    src_temp_1 = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
  610|      |
  611| 39.9M|                    reg_01_16x8b = _mm_hadd_epi16(src_temp_0, src_temp_1);
  612|      |
  613| 39.9M|                    reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
  614|      |
  615|      |                    /*Add an offset of 64 to round each output pixel value*/
  616| 39.9M|                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
  617|      |                    /*Divide each output pixel value by 128*/
  618| 39.9M|                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
  619|      |
  620| 39.9M|                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
  621|      |
  622|      |                    /*next get saturated 8 bit output pixel values*/
  623| 39.9M|                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
  624|      |
  625| 39.9M|                    reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
  626|      |
  627| 39.9M|                    reg_02_16x8b = _mm_srli_si128(reg_01_16x8b, (int) 8);
  628|      |
  629|      |                    /*Store the 2 output values*/
  630| 39.9M|                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_01_16x8b);
  631|      |
  632| 39.9M|                    *pu1_out_pixel = (UWORD8) i4_temp_pixel_holder;
  633| 39.9M|                    i4_temp_pixel_holder >>= 8;
  634|      |
  635| 39.9M|                    *(pu1_out_pixel + 1) = (UWORD8) i4_temp_pixel_holder;
  636|      |
  637| 39.9M|                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_02_16x8b);
  638|      |
  639| 39.9M|                    *(pu1_out_pixel + u4_out_stride) = (UWORD8) i4_temp_pixel_holder;
  640| 39.9M|                    i4_temp_pixel_holder >>= 8;
  641|      |
  642| 39.9M|                    *(pu1_out_pixel + u4_out_stride + 1) = (UWORD8) i4_temp_pixel_holder;
  643|      |
  644| 39.9M|                    pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 1)) >> DOWNSCALER_Q;
  645| 39.9M|                    pu1_out_pixel += 2;
  646| 39.9M|                }
  647|      |                /* Update the context for the next loop iteration */
  648| 9.97M|                u4_center_pixel_pos += u4_src_horz_increments;
  649| 9.97M|            }
  650|  223k|        }
  651|  260k|    }
  652| 1.04M|}
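
As the @remarks above note, the transposed store lets this one routine serve both dimensions. A hedged usage sketch of a full 2-D downscale under that assumption; ps_tmp, u4_scaled_wd, u4_scaled_ht, and the pass ordering are illustrative, since the real call sequence lives in the downscaler wrapper, not in this file:

    /* Pass 1: horizontal scale of W x H; the output lands transposed (H x W') */
    isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_src, ps_tmp,
                                                   pai1_filters, u4_scaled_wd,
                                                   u4_blk_ht, u1_is_chroma);

    /* Pass 2: a "horizontal" scale of the transposed buffer is a vertical     */
    /* scale of the original; the store transposes back, giving W' x H'        */
    isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_tmp, ps_dst,
                                                   pai1_filters, u4_scaled_ht,
                                                   u4_scaled_wd, u1_is_chroma);
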