Coverage Report

Created: 2026-01-09 06:18

/src/libavc/common/x86/ih264_weighted_pred_avx2.c
  Line|  Count|Source
     1|       |/******************************************************************************
     2|       | *
     3|       | * Copyright (C) 2015 The Android Open Source Project
     4|       | *
     5|       | * Licensed under the Apache License, Version 2.0 (the "License");
     6|       | * you may not use this file except in compliance with the License.
     7|       | * You may obtain a copy of the License at:
     8|       | *
     9|       | * http://www.apache.org/licenses/LICENSE-2.0
    10|       | *
    11|       | * Unless required by applicable law or agreed to in writing, software
    12|       | * distributed under the License is distributed on an "AS IS" BASIS,
    13|       | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14|       | * See the License for the specific language governing permissions and
    15|       | * limitations under the License.
    16|       | *
    17|       | *****************************************************************************
    18|       | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
    19|       |*/
    20|       |/*****************************************************************************/
    21|       |/*****************************************************************************/
    22|       |/* File Includes                                                             */
    23|       |/*****************************************************************************/
    24|       |
    25|       |#include <immintrin.h>
    26|       |#include "ih264_typedefs.h"
    27|       |#include "ih264_macros.h"
    28|       |#include "ih264_platform_macros.h"
    29|       |#include "ih264_weighted_pred.h"
    30|       |#include <stdint.h>
    31|       |#include <string.h>
    32|       |
    33|       |#include <stdio.h>
    34|       |
    35|       |
    36|       |/*****************************************************************************/
    37|       |/*                                                                           */
    38|       |/*  Function Name : ih264_weighted_bi_pred_luma_avx2                         */
    39|       |/*                                                                           */
    40|       |/*  Description   : This function performs the weighted biprediction as      */
    41|       |/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
    42|       |/*                  prediction process" for luma. The function gets two      */
    43|       |/*                  ht x wd blocks, weights them, adds them, rounds off the  */
    44|       |/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
    45|       |/*                  stores it in the destination block. (ht,wd) can be       */
    46|       |/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
    47|       |/*                                                                           */
    48|       |/*  Inputs        : pu1_src1  - Pointer to source 1                          */
    49|       |/*                  pu1_src2  - Pointer to source 2                          */
    50|       |/*                  pu1_dst   - Pointer to destination                       */
    51|       |/*                  src_strd1 - stride for source 1                          */
    52|       |/*                  src_strd2 - stride for source 2                          */
    53|       |/*                  dst_strd  - stride for destination                       */
    54|       |/*                  log_wd    - number of bits to be rounded off             */
    55|       |/*                  wt1       - weight value for source 1                    */
    56|       |/*                  wt2       - weight value for source 2                    */
    57|       |/*                  ofst1     - offset value for source 1                    */
    58|       |/*                  ofst2     - offset value for source 2                    */
    59|       |/*                  ht        - height of the block                          */
    60|       |/*                  wd        - width of the block                           */
    61|       |/*                                                                           */
    62|       |/*  Issues        : None                                                     */
    63|       |/*                                                                           */
    64|       |/*  Revision History:                                                        */
    65|       |/*                                                                           */
    66|       |/*         DD MM YYYY   Author(s)       Changes                              */
    67|       |/*         04 02 2015   Kaushik         Initial Version                      */
    68|       |/*                      Senthoor                                             */
    69|       |/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
    70|       |/*****************************************************************************/
    71|       |void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1,
    72|       |                                      UWORD8 *pu1_src2,
    73|       |                                      UWORD8 *pu1_dst,
    74|       |                                      WORD32 src_strd1,
    75|       |                                      WORD32 src_strd2,
    76|       |                                      WORD32 dst_strd,
    77|       |                                      WORD32 log_wd,
    78|       |                                      WORD32 wt1,
    79|       |                                      WORD32 wt2,
    80|       |                                      WORD32 ofst1,
    81|       |                                      WORD32 ofst2,
    82|       |                                      WORD32 ht,
    83|       |                                      WORD32 wd)
    84|  1.85k|{
    85|       |
    86|  1.85k|    __m256i wt1_8x32b, wt2_8x32b;
    87|  1.85k|    __m256i ofst_8x32b, round_8x32b;
    88|  1.85k|    __m256i zero;
    89|  1.85k|    zero = _mm256_set1_epi8(0);
    90|       |
    91|  1.85k|    WORD32 ofst;
    92|  1.85k|    WORD32 round_val, shft;
    93|       |
    94|  1.85k|    wt1 = (WORD16)(wt1 & 0xffff);
    95|  1.85k|    wt2 = (WORD16)(wt2 & 0xffff);
    96|  1.85k|    round_val = 1 << log_wd;
    97|  1.85k|    shft = log_wd + 1;
    98|  1.85k|    ofst1 = (WORD8)(ofst1 & 0xff);
    99|  1.85k|    ofst2 = (WORD8)(ofst2 & 0xff);
   100|  1.85k|    ofst = (ofst1 + ofst2 + 1) >> 1;
   101|       |
   102|  1.85k|    wt1_8x32b = _mm256_set1_epi16(wt1);
   103|  1.85k|    wt2_8x32b = _mm256_set1_epi16(wt2);
   104|  1.85k|    round_8x32b = _mm256_set1_epi16(round_val);
   105|  1.85k|    ofst_8x32b = _mm256_set1_epi16(ofst);
   106|       |
   107|       |
   108|  1.85k|    if(wd == 4)
   109|      0|    {
   110|      0|        __m128i y1_2_16x8b, y1_3_16x8b;
   111|      0|        __m128i y2_2_16x8b, y2_3_16x8b;
   112|       |
   113|      0|        __m256i y1_02_32x8b, y1_13_32x8b, y2_02_32x8b, y2_13_32x8b, y1_0_32x8b, y2_0_32x8b, y1_0_8x32b, y2_1_8x32b, y2_0_8x32b;
   114|      0|        __m128i y1_0_16x8b_128, y2_0_16x8b_128, y1_1_16x8b_128, y1_2_16x8b_128, y1_3_16x8b_128;
   115|       |
   116|      0|        do
   117|      0|        {
   118|      0|            y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
   119|      0|            y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
   120|       |
   121|      0|            y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
   122|      0|            y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
   123|       |
   124|      0|            y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
   125|      0|            y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
   126|      0|            y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
   127|      0|            y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
   128|       |
   129|      0|            y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b);
   130|      0|            y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b);
   131|      0|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8));
   132|      0|            y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8));
   133|       |
   134|      0|            y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // widen u8 to s16
   135|      0|            y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
   136|       |
   137|      0|            y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
   138|      0|            y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
   139|       |
   140|      0|            y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
   141|       |
   142|      0|            y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
   143|       |
   144|      0|            y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
   145|       |
   146|      0|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b));
   147|      0|            y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4);
   148|      0|            y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8);
   149|      0|            y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12);
   150|       |
   151|      0|            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128);
   152|      0|            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128);
   153|      0|            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128);
   154|      0|            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128);
   155|       |
   156|      0|            ht -= 4;
   157|      0|            pu1_src1 += src_strd1 << 2;
   158|      0|            pu1_src2 += src_strd2 << 2;
   159|      0|            pu1_dst += dst_strd << 2;
   160|      0|        }
   161|      0|        while(ht > 0);
   162|      0|    }
   163|  1.85k|    else if(wd == 8)
   164|    184|    {
   165|    184|        __m128i y1_0_16x8b_128, y2_0_16x8b_128, y1_2_16x8b_128, y1_1_16x8b_128, y1_3_16x8b_128;
   166|    184|        __m256i y1_02_32x8b, y1_13_32x8b, y2_02_32x8b, y2_13_32x8b, y1_0_32x8b, y2_0_32x8b, y1_0_8x32b;
   167|    184|        __m256i y1_1_8x32b, y2_0_8x32b, y2_1_8x32b;
   168|       |
   169|    184|        do
   170|    736|        {
   171|       |
   172|    736|            y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
   173|    736|            y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
   174|       |
   175|    736|            y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
   176|    736|            y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
   177|       |
   178|    736|            y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
   179|    736|            y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
   180|    736|            y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
   181|    736|            y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
   182|       |
   183|    736|            y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b);
   184|    736|            y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b);
   185|       |
   186|    736|            y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b));
   187|    736|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b, y1_0_32x8b, 0x1));
   188|    736|            y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128);
   189|       |
   190|    736|            y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b));
   191|    736|            y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b, y2_0_32x8b, 0x1));
   192|    736|            y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
   193|       |
   194|       |
   195|    736|            y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
   196|    736|            y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
   197|    736|            y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b);
   198|    736|            y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b);
   199|       |
   200|    736|            y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
   201|    736|            y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b);
   202|       |
   203|    736|            y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
   204|    736|            y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft);
   205|       |
   206|    736|            y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
   207|    736|            y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b);
   208|       |
   209|    736|            y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b);
   210|    736|            y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
   211|    736|            y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
   212|       |
   213|    736|            y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b, y1_0_32x8b, 1);
   214|    736|            y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
   215|    736|            y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
   216|       |
   217|    736|            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128);
   218|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128);
   219|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128);
   220|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128);
   221|       |
   222|    736|            ht -= 4;
   223|    736|            pu1_src1 += src_strd1 << 2;
   224|    736|            pu1_src2 += src_strd2 << 2;
   225|    736|            pu1_dst += dst_strd << 2;
   226|       |
   227|    736|        }
   228|    736|        while(ht > 0);
   229|    184|    }
   230|  1.66k|    else // wd == 16
   231|  1.66k|    {
   232|  1.66k|        __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
   233|  1.66k|        __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
   234|       |
   235|  1.66k|        __m256i zero_32x8b, y1_0_32x8b, y2_0_32x8b;
   236|  1.66k|        zero_32x8b = _mm256_set1_epi8(0);
   237|       |
   238|  1.66k|        do
   239|  10.2k|        {
   240|       |
   241|  10.2k|            y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
   242|  10.2k|            y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
   243|       |
   244|  10.2k|            y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
   245|  10.2k|            y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
   246|       |
   247|  10.2k|            y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
   248|  10.2k|            y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
   249|       |
   250|  10.2k|            y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
   251|  10.2k|            y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
   252|       |
   253|  10.2k|            y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
   254|  10.2k|            y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
   255|       |
   256|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
   257|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
   258|       |
   259|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
   260|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
   261|       |
   262|  10.2k|            y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
   263|  10.2k|            y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
   264|       |
   265|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
   266|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
   267|       |
   268|  10.2k|            y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
   269|       |
   270|  10.2k|            _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
   271|       |
   272|  10.2k|            ht -= 2;
   273|  10.2k|            pu1_src1 += src_strd1 << 1;
   274|  10.2k|            pu1_src2 += src_strd2 << 1;
   275|  10.2k|            pu1_dst += dst_strd << 1;
   276|  10.2k|        }
   277|  10.2k|        while(ht > 0);
   278|  1.66k|    }
   279|  1.85k|}
   280|       |
   281|       |
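Note: all three width-specific paths above compute the same per-pixel result; only the load/shuffle plumbing differs. Below is a minimal scalar sketch of the sec 8.4.2.3.2 formula as a reading aid; clip_u8 and weighted_bi_pred_ref are hypothetical names, not library functions, and ofst corresponds to (ofst1 + ofst2 + 1) >> 1 computed at line 100. Also observable in the listing: only the wd == 16 path adds round_8x32b before the shift (line 259); the wd == 4 and wd == 8 paths shift without the rounding term (lines 142 and 203).

    /* Hypothetical scalar reference for H.264 weighted bi-prediction. */
    #include <stdint.h>

    static uint8_t clip_u8(int32_t x)
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    static void weighted_bi_pred_ref(const uint8_t *src1, const uint8_t *src2,
                                     uint8_t *dst, int32_t strd1, int32_t strd2,
                                     int32_t dst_strd, int32_t log_wd,
                                     int32_t wt1, int32_t wt2, int32_t ofst,
                                     int32_t ht, int32_t wd)
    {
        int32_t round_val = 1 << log_wd; /* rounding term added before the shift */
        int32_t r, c;

        for(r = 0; r < ht; r++)
            for(c = 0; c < wd; c++)
                dst[r * dst_strd + c] =
                    clip_u8(((src1[r * strd1 + c] * wt1 +
                              src2[r * strd2 + c] * wt2 +
                              round_val) >> (log_wd + 1)) + ofst);
    }
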
   282|       |/*****************************************************************************/
   283|       |/*                                                                           */
   284|       |/*  Function Name : ih264_weighted_bi_pred_chroma_avx2                       */
   285|       |/*                                                                           */
   286|       |/*  Description   : This function performs the weighted biprediction as      */
   287|       |/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
   288|       |/*                  prediction process" for chroma. The function gets two    */
   289|       |/*                  ht x wd blocks, weights them, adds them, rounds off the  */
   290|       |/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
   291|       |/*                  stores it in the destination block. (ht,wd) can be       */
   292|       |/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
   293|       |/*                                                                           */
   294|       |/*  Inputs        : pu1_src1  - Pointer to source 1                          */
   295|       |/*                  pu1_src2  - Pointer to source 2                          */
   296|       |/*                  pu1_dst   - Pointer to destination                       */
   297|       |/*                  src_strd1 - stride for source 1                          */
   298|       |/*                  src_strd2 - stride for source 2                          */
   299|       |/*                  dst_strd  - stride for destination                       */
   300|       |/*                  log_wd    - number of bits to be rounded off             */
   301|       |/*                  wt1       - weight values for u and v in source 1        */
   302|       |/*                  wt2       - weight values for u and v in source 2        */
   303|       |/*                  ofst1     - offset value for u and v in source 1         */
   304|       |/*                  ofst2     - offset value for u and v in source 2         */
   305|       |/*                  ht        - height of the block                          */
   306|       |/*                  wd        - width of the block                           */
   307|       |/*                                                                           */
   308|       |/*  Issues        : None                                                     */
   309|       |/*                                                                           */
   310|       |/*  Revision History:                                                        */
   311|       |/*                                                                           */
   312|       |/*         DD MM YYYY   Author(s)       Changes                              */
   313|       |/*         04 02 2015   Kaushik         Initial Version                      */
   314|       |/*                      Senthoor                                             */
   315|       |/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
   316|       |/*****************************************************************************/
   317|       |void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1,
   318|       |                                        UWORD8 *pu1_src2,
   319|       |                                        UWORD8 *pu1_dst,
   320|       |                                        WORD32 src_strd1,
   321|       |                                        WORD32 src_strd2,
   322|       |                                        WORD32 dst_strd,
   323|       |                                        WORD32 log_wd,
   324|       |                                        WORD32 wt1,
   325|       |                                        WORD32 wt2,
   326|       |                                        WORD32 ofst1,
   327|       |                                        WORD32 ofst2,
   328|       |                                        WORD32 ht,
   329|       |                                        WORD32 wd)
   330|  1.85k|{
   331|       |
   332|  1.85k|    __m128i y1_0_16x8b, y1_1_16x8b;
   333|  1.85k|    __m128i y2_0_16x8b, y2_1_16x8b;
   334|       |
   335|  1.85k|    __m128i wt1_8x16b, wt2_8x16b;
   336|  1.85k|    __m128i ofst_8x16b, round_8x16b;
   337|       |
   338|  1.85k|    WORD32 ofst1_u, ofst2_u, ofst_u;
   339|  1.85k|    WORD32 ofst1_v, ofst2_v, ofst_v;
   340|  1.85k|    WORD32 round_val, shft, ofst_val, ofst_val_256;
   341|       |
   342|  1.85k|    round_val = 1 << log_wd;
   343|  1.85k|    shft = log_wd + 1;
   344|       |
   345|  1.85k|    ofst1_u = (WORD8)(ofst1 & 0xff);
   346|  1.85k|    ofst1_v = (WORD8)(ofst1 >> 8);
   347|  1.85k|    ofst2_u = (WORD8)(ofst2 & 0xff);
   348|  1.85k|    ofst2_v = (WORD8)(ofst2 >> 8);
   349|       |
   350|  1.85k|    wt1_8x16b = _mm_set1_epi32(wt1);
   351|  1.85k|    wt2_8x16b = _mm_set1_epi32(wt2);
   352|       |
   353|  1.85k|    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
   354|  1.85k|    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
   355|  1.85k|    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
   356|  1.85k|    ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16);
   357|       |
   358|  1.85k|    round_8x16b = _mm_set1_epi16(round_val);
   359|  1.85k|    ofst_8x16b = _mm_set1_epi32(ofst_val);
   360|       |
   361|  1.85k|    if(wd == 2)
   362|      0|    {
   363|      0|        __m128i y1_0_8x16b, y2_0_8x16b;
   364|       |
   365|      0|        do
   366|      0|        {
   367|      0|            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); // load two rows, 64 bits each
   368|      0|            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
   369|       |
   370|      0|            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
   371|      0|            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
   372|       |
   373|      0|            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
   374|      0|            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
   375|       |
   376|      0|            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
   377|      0|            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
   378|       |
   379|      0|            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
   380|      0|            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
   381|       |
   382|      0|            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
   383|      0|            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
   384|       |
   385|      0|            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
   386|      0|            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
   387|       |
   388|      0|            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
   389|      0|            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
   390|       |
   391|      0|            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
   392|      0|            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
   393|       |
   394|      0|            ht -= 2;
   395|      0|            pu1_src1 += src_strd1 << 1;
   396|      0|            pu1_src2 += src_strd2 << 1;
   397|      0|            pu1_dst += dst_strd << 1;
   398|      0|        }
   399|      0|        while(ht > 0);
   400|      0|    }
   401|  1.85k|    else if(wd == 4)
   402|    184|    {
   403|    184|        __m128i y1_0_8x16b, y1_1_8x16b;
   404|    184|        __m128i y2_0_8x16b, y2_1_8x16b;
   405|       |
   406|    184|        do
   407|    736|        {
   408|    736|            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); // load two rows, 64 bits each
   409|    736|            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
   410|       |
   411|    736|            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
   412|    736|            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
   413|       |
   414|    736|            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
   415|    736|            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
   416|       |
   417|    736|            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
   418|    736|            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
   419|       |
   420|    736|            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
   421|    736|            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
   422|    736|            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
   423|    736|            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
   424|       |
   425|    736|            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
   426|    736|            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
   427|       |
   428|    736|            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
   429|    736|            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
   430|       |
   431|    736|            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
   432|    736|            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
   433|       |
   434|    736|            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
   435|    736|            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
   436|       |
   437|    736|            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
   438|    736|            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
   439|       |
   440|    736|            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
   441|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
   442|       |
   443|    736|            ht -= 2;
   444|    736|            pu1_src1 += src_strd1 << 1;
   445|    736|            pu1_src2 += src_strd2 << 1;
   446|    736|            pu1_dst += dst_strd << 1;
   447|    736|        }
   448|    736|        while(ht > 0);
   449|    184|    }
   450|  1.66k|    else // wd == 8
   451|  1.66k|    {
   452|  1.66k|        __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
   453|  1.66k|        __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
   454|  1.66k|        __m256i y1_0_32x8b, y2_0_32x8b, ofst_8x32b, round_8x32b;
   455|  1.66k|        __m256i wt1_8x32b, wt2_8x32b;
   456|  1.66k|        __m256i zero_32x8b;
   457|       |
   458|  1.66k|        wt1_8x32b = _mm256_set1_epi16(wt1);
   459|  1.66k|        wt2_8x32b = _mm256_set1_epi16(wt2);
   460|  1.66k|        round_8x32b = _mm256_set1_epi16(round_val);
   461|  1.66k|        ofst_8x32b = _mm256_set1_epi32(ofst_val_256);
   462|  1.66k|        zero_32x8b = _mm256_set1_epi8(0);
   463|       |
   464|  1.66k|        do
   465|  5.12k|        {
   466|  5.12k|            y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
   467|  5.12k|            y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
   468|  5.12k|            y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
   469|  5.12k|            y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
   470|  5.12k|            y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
   471|  5.12k|            y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
   472|  5.12k|            y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
   473|  5.12k|            y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
   474|       |
   475|  5.12k|            y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
   476|  5.12k|            y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
   477|       |
   478|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
   479|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
   480|       |
   481|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
   482|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
   483|       |
   484|  5.12k|            y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
   485|  5.12k|            y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
   486|       |
   487|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
   488|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
   489|       |
   490|       |
   491|  5.12k|            y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
   492|  5.12k|            _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
   493|       |
   494|  5.12k|            ht -= 2;
   495|  5.12k|            pu1_src1 += src_strd1 << 1;
   496|  5.12k|            pu1_src2 += src_strd2 << 1;
   497|  5.12k|            pu1_dst += dst_strd << 1;
   498|  5.12k|        }
   499|  5.12k|        while(ht > 0);
   500|  1.66k|    }
   501|  1.85k|}
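
In the chroma variant, wt1 and wt2 pack the U and V weights as the two 16-bit halves of one 32-bit value (hence _mm_set1_epi32 at lines 350-351), and ofst1/ofst2 carry the U and V offsets in their low and high bytes, matching the interleaved UVUV source layout. As a sanity check against the covered wd == 16 luma path, here is a minimal hypothetical harness (not part of the report; it assumes the AVX2 declaration is visible via ih264_weighted_pred.h, which the file itself includes). With wt1 = wt2 = 32 and log_wd = 5, the H.264 default-weight case, the formula reduces to (s1 + s2 + 1) >> 1:

    /* Hypothetical call sketch; exercises the covered wd == 16 luma path. */
    #include <stdio.h>
    #include <string.h>
    #include "ih264_typedefs.h"
    #include "ih264_weighted_pred.h"

    int main(void)
    {
        UWORD8 src1[16 * 16], src2[16 * 16], dst[16 * 16];

        memset(src1, 100, sizeof(src1));
        memset(src2, 102, sizeof(src2));

        ih264_weighted_bi_pred_luma_avx2(src1, src2, dst,
                                         16, 16, 16, /* strides          */
                                         5, 32, 32,  /* log_wd, wt1, wt2 */
                                         0, 0,       /* ofst1, ofst2     */
                                         16, 16);    /* ht, wd           */

        /* Every pixel: (100*32 + 102*32 + 32) >> 6 = 101. */
        printf("dst[0] = %d\n", dst[0]);
        return dst[0] == 101 ? 0 : 1;
    }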