Coverage Report

Created: 2025-08-29 06:39

/src/libavc/common/x86/ih264_weighted_pred_sse42.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_weighted_pred_sse42.c                          */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for weighted           */
25
/*                      prediction functions in x86 sse4 intrinsics          */
26
/*                                                                           */
27
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
28
/*                      ih264_default_weighted_pred_chroma_sse42()           */
29
/*                      ih264_weighted_pred_luma_sse42()                     */
30
/*                      ih264_weighted_pred_chroma_sse42()                   */
31
/*                      ih264_weighted_bi_pred_luma_sse42()                  */
32
/*                      ih264_weighted_bi_pred_chroma_sse42()                */
33
/*                                                                           */
34
/*  Issues / Problems : None                                                 */
35
/*                                                                           */
36
/*  Revision History  :                                                      */
37
/*                                                                           */
38
/*         DD MM YYYY   Author(s)       Changes                              */
39
/*         30 01 2015   Kaushik         Initial version                      */
40
/*                      Senthoor                                             */
41
/*                                                                           */
42
/*****************************************************************************/
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
#include <immintrin.h>
48
#include "ih264_typedefs.h"
49
#include "ih264_macros.h"
50
#include "ih264_platform_macros.h"
51
#include "ih264_weighted_pred.h"
52
53
/*****************************************************************************/
54
/*  Function definitions                                                     */
55
/*****************************************************************************/
56
/*****************************************************************************/
57
/*                                                                           */
58
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
59
/*                                                                           */
60
/*  Description   : This function performs the default weighted prediction   */
61
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
62
/*                  sample prediction process" for luma. The function gets   */
63
/*                  two ht x wd blocks, calculates their rounded-average and */
64
/*                  stores it in the destination block. (ht,wd) can be       */
65
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
66
/*                                                                           */
67
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
68
/*                  pu1_src2  - Pointer to source 2                          */
69
/*                  pu1_dst   - Pointer to destination                       */
70
/*                  src_strd1 - stride for source 1                          */
71
/*                  src_strd2 - stride for source 2                          */
72
/*                  dst_strd  - stride for destination                       */
73
/*                  ht        - height of the block                          */
74
/*                  wd        - width of the block                           */
75
/*                                                                           */
76
/*  Issues        : None                                                     */
77
/*                                                                           */
78
/*  Revision History:                                                        */
79
/*                                                                           */
80
/*         DD MM YYYY   Author(s)       Changes                              */
81
/*         04 02 2015   Kaushik         Initial Version                      */
82
/*                      Senthoor                                             */
83
/*                                                                           */
84
/*****************************************************************************/
85
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
86
                                            UWORD8 *pu1_src2,
87
                                            UWORD8 *pu1_dst,
88
                                            WORD32 src_strd1,
89
                                            WORD32 src_strd2,
90
                                            WORD32 dst_strd,
91
                                            WORD32 ht,
92
                                            WORD32 wd)
93
374k
{
94
374k
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
95
374k
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
96
97
374k
    if(wd == 4)
98
4.59k
    {
99
4.59k
        do
100
6.12k
        {
101
6.12k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
102
6.12k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
103
6.12k
            y0_2_16x8b = _mm_loadl_epi64(
104
6.12k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
105
6.12k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
106
107
6.12k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
108
6.12k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
109
6.12k
            y1_2_16x8b = _mm_loadl_epi64(
110
6.12k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
111
6.12k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
112
113
6.12k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
114
6.12k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
115
6.12k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
116
6.12k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
117
118
6.12k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
119
6.12k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
120
6.12k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
121
6.12k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
122
123
6.12k
            ht -= 4;
124
6.12k
            pu1_src1 += src_strd1 << 2;
125
6.12k
            pu1_src2 += src_strd2 << 2;
126
6.12k
            pu1_dst += dst_strd << 2;
127
6.12k
        }
128
6.12k
        while(ht > 0);
129
4.59k
    }
130
370k
    else if(wd == 8)
131
71.8k
    {
132
71.8k
        do
133
169k
        {
134
169k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
135
169k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
136
169k
            y0_2_16x8b = _mm_loadl_epi64(
137
169k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
138
169k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
139
140
169k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
141
169k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
142
169k
            y1_2_16x8b = _mm_loadl_epi64(
143
169k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
144
169k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
145
146
169k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
147
169k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
148
169k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
149
169k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
150
151
169k
            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
152
169k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
153
169k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
154
169k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
155
156
169k
            ht -= 4;
157
169k
            pu1_src1 += src_strd1 << 2;
158
169k
            pu1_src2 += src_strd2 << 2;
159
169k
            pu1_dst += dst_strd << 2;
160
169k
        }
161
169k
        while(ht > 0);
162
71.8k
    }
163
298k
    else // wd == 16
164
298k
    {
165
298k
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
166
298k
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
167
168
298k
        do
169
575k
        {
170
575k
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
171
575k
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
172
575k
            y0_2_16x8b = _mm_loadu_si128(
173
575k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
174
575k
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
175
575k
            y0_4_16x8b = _mm_loadu_si128(
176
575k
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
177
575k
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
178
575k
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
179
575k
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
180
181
575k
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
182
575k
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
183
575k
            y1_2_16x8b = _mm_loadu_si128(
184
575k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
185
575k
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
186
575k
            y1_4_16x8b = _mm_loadu_si128(
187
575k
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
188
575k
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
189
575k
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
190
575k
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
191
192
575k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
193
575k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
194
575k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
195
575k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
196
575k
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
197
575k
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
198
575k
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
199
575k
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
200
201
575k
            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
202
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
203
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
204
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
205
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
206
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
207
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
208
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
209
210
575k
            ht -= 8;
211
575k
            pu1_src1 += src_strd1 << 3;
212
575k
            pu1_src2 += src_strd2 << 3;
213
575k
            pu1_dst += dst_strd << 3;
214
575k
        }
215
575k
        while(ht > 0);
216
298k
    }
217
374k
}
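
Note: per pixel, all three SSE4.2 paths above reduce to the rounded average (a + b + 1) >> 1, which is exactly what _mm_avg_epu8 computes on unsigned bytes. A minimal scalar sketch of the same operation (a hypothetical reference helper, not part of the library):

/* Hypothetical scalar reference for the default weighted prediction.
 * The SSE4.2 routine above produces the same output 4/8/16 pixels at
 * a time using _mm_avg_epu8. */
static void default_weighted_pred_ref(UWORD8 *pu1_src1, UWORD8 *pu1_src2,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd1, WORD32 src_strd2,
                                      WORD32 dst_strd, WORD32 ht, WORD32 wd)
{
    WORD32 i, j;

    for(i = 0; i < ht; i++)
    {
        for(j = 0; j < wd; j++)
            pu1_dst[j] = (UWORD8)((pu1_src1[j] + pu1_src2[j] + 1) >> 1);

        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
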
218
219
/*****************************************************************************/
220
/*                                                                           */
221
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
222
/*                                                                           */
223
/*  Description   : This function performs the default weighted prediction   */
224
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
225
/*                  sample prediction process" for chroma. The function gets */
226
/*                  two ht x wd blocks, calculates their rounded-average and */
227
/*                  stores it in the destination block. (ht,wd) can be       */
228
/*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
229
/*                                                                           */
230
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
231
/*                  pu1_src2  - Pointer to source 2                          */
232
/*                  pu1_dst   - Pointer to destination                       */
233
/*                  src_strd1 - stride for source 1                          */
234
/*                  src_strd2 - stride for source 2                          */
235
/*                  dst_strd  - stride for destination                       */
236
/*                  ht        - height of the block                          */
237
/*                  wd        - width of the block                           */
238
/*                                                                           */
239
/*  Issues        : None                                                     */
240
/*                                                                           */
241
/*  Revision History:                                                        */
242
/*                                                                           */
243
/*         DD MM YYYY   Author(s)       Changes                              */
244
/*         04 02 2015   Kaushik         Initial Version                      */
245
/*                      Senthoor                                             */
246
/*                                                                           */
247
/*****************************************************************************/
248
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
249
                                              UWORD8 *pu1_src2,
250
                                              UWORD8 *pu1_dst,
251
                                              WORD32 src_strd1,
252
                                              WORD32 src_strd2,
253
                                              WORD32 dst_strd,
254
                                              WORD32 ht,
255
                                              WORD32 wd)
256
374k
{
257
374k
    __m128i uv0_0_16x8b, uv0_1_16x8b;
258
374k
    __m128i uv1_0_16x8b, uv1_1_16x8b;
259
260
374k
    if(wd == 2)
261
4.59k
    {
262
4.59k
        do
263
6.12k
        {
264
6.12k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
265
6.12k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
266
267
6.12k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
268
6.12k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
269
270
6.12k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
271
6.12k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
272
273
6.12k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
274
6.12k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
275
276
6.12k
            ht -= 2;
277
6.12k
            pu1_src1 += src_strd1 << 1;
278
6.12k
            pu1_src2 += src_strd2 << 1;
279
6.12k
            pu1_dst += dst_strd << 1;
280
6.12k
        }
281
6.12k
        while(ht > 0);
282
4.59k
    }
283
370k
    else if(wd == 4)
284
71.8k
    {
285
71.8k
        do
286
169k
        {
287
169k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
288
169k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
289
290
169k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
291
169k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
292
293
169k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
294
169k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
295
296
169k
            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
297
169k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
298
299
169k
            ht -= 2;
300
169k
            pu1_src1 += src_strd1 << 1;
301
169k
            pu1_src2 += src_strd2 << 1;
302
169k
            pu1_dst += dst_strd << 1;
303
169k
        }
304
169k
        while(ht > 0);
305
71.8k
    }
306
298k
    else // wd == 8
307
298k
    {
308
298k
        __m128i uv0_2_16x8b, uv0_3_16x8b;
309
298k
        __m128i uv1_2_16x8b, uv1_3_16x8b;
310
311
298k
        do
312
575k
        {
313
575k
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
314
575k
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
315
575k
            uv0_2_16x8b = _mm_loadu_si128(
316
575k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
317
575k
            uv0_3_16x8b = _mm_loadu_si128(
318
575k
                            (__m128i *)(pu1_src1 + src_strd1 * 3));
319
320
575k
            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
321
575k
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
322
575k
            uv1_2_16x8b = _mm_loadu_si128(
323
575k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
324
575k
            uv1_3_16x8b = _mm_loadu_si128(
325
575k
                            (__m128i *)(pu1_src2 + src_strd2 * 3));
326
327
575k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
328
575k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
329
575k
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
330
575k
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
331
332
575k
            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
333
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
334
575k
            _mm_storeu_si128(
335
575k
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
336
575k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
337
338
575k
            ht -= 4;
339
575k
            pu1_src1 += src_strd1 << 2;
340
575k
            pu1_src2 += src_strd2 << 2;
341
575k
            pu1_dst += dst_strd << 2;
342
575k
        }
343
575k
        while(ht > 0);
344
298k
    }
345
374k
}
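
Note: the chroma plane stores Cb and Cr interleaved, so a block of wd chroma samples occupies 2*wd bytes per row; that is why the wd == 2 case reads and writes 4 bytes and the wd == 8 case uses full 16-byte vectors. One row of the same rounded average in scalar form (a hypothetical sketch, assuming j is a WORD32 loop counter):

/* Hypothetical scalar sketch of one interleaved-chroma row */
for(j = 0; j < 2 * wd; j++)
    pu1_dst[j] = (UWORD8)((pu1_src1[j] + pu1_src2[j] + 1) >> 1);
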
346
347
/*****************************************************************************/
348
/*                                                                           */
349
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
350
/*                                                                           */
351
/*  Description   : This function performs the weighted prediction as        */
352
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
353
/*                  prediction process" for luma. The function gets one      */
354
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
355
/*                  saturates it to unsigned 8-bit and stores it in the      */
356
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
357
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
358
/*                                                                           */
359
/*  Inputs        : pu1_src  - Pointer to source                             */
360
/*                  pu1_dst  - Pointer to destination                        */
361
/*                  src_strd - stride for source                             */
362
/*                  dst_strd - stride for destination                        */
363
/*                  log_wd   - number of bits to be rounded off              */
364
/*                  wt       - weight value                                  */
365
/*                  ofst     - offset value                                  */
366
/*                  ht       - height of the block                           */
367
/*                  wd       - width of the block                            */
368
/*                                                                           */
369
/*  Issues        : None                                                     */
370
/*                                                                           */
371
/*  Revision History:                                                        */
372
/*                                                                           */
373
/*         DD MM YYYY   Author(s)       Changes                              */
374
/*         04 02 2015   Kaushik         Initial Version                      */
375
/*                      Senthoor                                             */
376
/*                                                                           */
377
/*****************************************************************************/
378
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
379
                                    UWORD8 *pu1_dst,
380
                                    WORD32 src_strd,
381
                                    WORD32 dst_strd,
382
                                    WORD32 log_wd,
383
                                    WORD32 wt,
384
                                    WORD32 ofst,
385
                                    WORD32 ht,
386
                                    WORD32 wd)
387
6.08M
{
388
6.08M
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
389
390
6.08M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
391
392
6.08M
    WORD32 round_val;
393
394
6.08M
    wt = (WORD16)(wt & 0xffff);
395
6.08M
    round_val = 1 << (log_wd - 1);
396
6.08M
    ofst = (WORD8)(ofst & 0xff);
397
398
6.08M
    wt_8x16b = _mm_set1_epi16(wt);
399
6.08M
    round_8x16b = _mm_set1_epi16(round_val);
400
6.08M
    ofst_8x16b = _mm_set1_epi16(ofst);
401
402
6.08M
    if(wd == 4)
403
72.8k
    {
404
72.8k
        __m128i y_0_8x16b, y_2_8x16b;
405
406
72.8k
        do
407
132k
        {
408
132k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
409
132k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
410
132k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
411
132k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
412
413
132k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
414
132k
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
415
416
132k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
417
132k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
418
419
132k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
420
132k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
421
422
132k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
423
132k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
424
425
132k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
426
132k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
427
428
132k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
429
132k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
430
431
132k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
432
132k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
433
132k
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
434
132k
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
435
436
132k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
437
132k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
438
132k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
439
132k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
440
441
132k
            ht -= 4;
442
132k
            pu1_src += src_strd << 2;
443
132k
            pu1_dst += dst_strd << 2;
444
132k
        }
445
132k
        while(ht > 0);
446
72.8k
    }
447
6.01M
    else if(wd == 8)
448
76.0k
    {
449
76.0k
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
450
451
76.0k
        do
452
186k
        {
453
186k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
454
186k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
455
186k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
456
186k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
457
458
186k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
459
186k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
460
186k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
461
186k
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
462
463
186k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
464
186k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
465
186k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
466
186k
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
467
468
186k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
469
186k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
470
186k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
471
186k
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
472
473
186k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
474
186k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
475
186k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
476
186k
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
477
478
186k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
479
186k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
480
186k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
481
186k
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
482
483
186k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
484
186k
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
485
186k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
486
186k
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
487
488
186k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
489
186k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
490
186k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
491
186k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
492
493
186k
            ht -= 4;
494
186k
            pu1_src += src_strd << 2;
495
186k
            pu1_dst += dst_strd << 2;
496
186k
        }
497
186k
        while(ht > 0);
498
76.0k
    }
499
5.93M
    else // wd == 16
500
5.93M
    {
501
5.93M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
502
5.93M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
503
504
5.93M
        __m128i zero_16x8b;
505
5.93M
        zero_16x8b = _mm_set1_epi8(0);
506
507
5.93M
        do
508
23.6M
        {
509
23.6M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
510
23.6M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
511
23.6M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
512
23.6M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
513
514
23.6M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
515
23.6M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
516
23.6M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
517
23.6M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
518
23.6M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
519
23.6M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
520
23.6M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
521
23.6M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
522
523
23.6M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
524
23.6M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
525
23.6M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
526
23.6M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
527
23.6M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
528
23.6M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
529
23.6M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
530
23.6M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
531
532
23.6M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
533
23.6M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
534
23.6M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
535
23.6M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
536
23.6M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
537
23.6M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
538
23.6M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
539
23.6M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
540
541
23.6M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
542
23.6M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
543
23.6M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
544
23.6M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
545
23.6M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
546
23.6M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
547
23.6M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
548
23.6M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
549
550
23.6M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
551
23.6M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
552
23.6M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
553
23.6M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
554
23.6M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
555
23.6M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
556
23.6M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
557
23.6M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
558
559
23.6M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
560
23.6M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
561
23.6M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
562
23.6M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
563
564
23.6M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
565
23.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
566
23.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
567
23.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
568
569
23.6M
            ht -= 4;
570
23.6M
            pu1_src += src_strd << 2;
571
23.6M
            pu1_dst += dst_strd << 2;
572
23.6M
        }
573
23.6M
        while(ht > 0);
574
5.93M
    }
575
6.08M
}
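
Note: each output sample in the loops above follows the single-direction weighting of the SIMD path: multiply by wt, add the rounding term 1 << (log_wd - 1), arithmetic-shift right by log_wd, add ofst, then saturate to [0, 255] via _mm_packus_epi16. A hypothetical scalar sketch, assuming log_wd >= 1 as in the vector code (the SIMD path uses saturating 16-bit arithmetic, so extreme weight values can behave slightly differently):

/* Hypothetical scalar reference for one weighted luma sample */
static UWORD8 weight_sample_ref(UWORD8 u1_src, WORD32 wt, WORD32 ofst,
                                WORD32 log_wd)
{
    WORD32 i4_tmp = ((u1_src * wt + (1 << (log_wd - 1))) >> log_wd) + ofst;

    /* saturate to the unsigned 8-bit range, as _mm_packus_epi16 does */
    return (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
}
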
576
577
/*****************************************************************************/
578
/*                                                                           */
579
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
580
/*                                                                           */
581
/*  Description   : This function performs the weighted prediction as        */
582
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
583
/*                  prediction process" for chroma. The function gets one    */
584
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
585
/*                  saturates it to unsigned 8-bit and stores it in the      */
586
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
587
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
588
/*                                                                           */
589
/*  Inputs        : pu1_src  - Pointer to source                             */
590
/*                  pu1_dst  - Pointer to destination                        */
591
/*                  src_strd - stride for source                             */
592
/*                  dst_strd - stride for destination                        */
593
/*                  log_wd   - number of bits to be rounded off              */
594
/*                  wt       - weight values for u and v                     */
595
/*                  ofst     - offset values for u and v                     */
596
/*                  ht       - height of the block                           */
597
/*                  wd       - width of the block                            */
598
/*                                                                           */
599
/*  Issues        : None                                                     */
600
/*                                                                           */
601
/*  Revision History:                                                        */
602
/*                                                                           */
603
/*         DD MM YYYY   Author(s)       Changes                              */
604
/*         04 02 2015   Kaushik         Initial Version                      */
605
/*                      Senthoor                                             */
606
/*                                                                           */
607
/*****************************************************************************/
608
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
609
                                      UWORD8 *pu1_dst,
610
                                      WORD32 src_strd,
611
                                      WORD32 dst_strd,
612
                                      WORD32 log_wd,
613
                                      WORD32 wt,
614
                                      WORD32 ofst,
615
                                      WORD32 ht,
616
                                      WORD32 wd)
617
6.08M
{
618
6.08M
    __m128i y_0_16x8b, y_1_16x8b;
619
620
6.08M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
621
622
6.08M
    WORD32 ofst_u, ofst_v;
623
6.08M
    WORD32 round_val;
624
625
6.08M
    ofst_u = (WORD8)(ofst & 0xff);
626
6.08M
    ofst_v = (WORD8)(ofst >> 8);
627
6.08M
    round_val = 1 << (log_wd - 1);
628
6.08M
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);
629
630
6.08M
    wt_8x16b = _mm_set1_epi32(wt);
631
6.08M
    round_8x16b = _mm_set1_epi16(round_val);
632
6.08M
    ofst_8x16b = _mm_set1_epi32(ofst);
633
634
6.08M
    if(wd == 2)
635
72.8k
    {
636
72.8k
        __m128i y_0_8x16b;
637
638
72.8k
        do
639
132k
        {
640
132k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
641
132k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
642
643
132k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
644
645
132k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
646
647
132k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
648
649
132k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
650
651
132k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
652
653
132k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
654
655
132k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
656
132k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
657
658
132k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
659
132k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
660
661
132k
            ht -= 2;
662
132k
            pu1_src += src_strd << 1;
663
132k
            pu1_dst += dst_strd << 1;
664
132k
        }
665
132k
        while(ht > 0);
666
72.8k
    }
667
6.01M
    else if(wd == 4)
668
76.0k
    {
669
76.0k
        __m128i y_0_8x16b, y_1_8x16b;
670
671
76.0k
        do
672
186k
        {
673
186k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
674
186k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
675
676
186k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
677
186k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
678
679
186k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
680
186k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
681
682
186k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
683
186k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
684
685
186k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
686
186k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
687
688
186k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
689
186k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
690
691
186k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
692
186k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
693
694
186k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
695
186k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
696
697
186k
            ht -= 2;
698
186k
            pu1_src += src_strd << 1;
699
186k
            pu1_dst += dst_strd << 1;
700
186k
        }
701
186k
        while(ht > 0);
702
76.0k
    }
703
5.93M
    else // wd == 8
704
5.93M
    {
705
5.93M
        __m128i y_2_16x8b, y_3_16x8b;
706
5.93M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
707
5.93M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
708
709
5.93M
        __m128i zero_16x8b;
710
5.93M
        zero_16x8b = _mm_set1_epi8(0);
711
712
5.93M
        do
713
11.8M
        {
714
11.8M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
715
11.8M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
716
11.8M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
717
11.8M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
718
719
11.8M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
720
11.8M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
721
11.8M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
722
11.8M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
723
11.8M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
724
11.8M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
725
11.8M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
726
11.8M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
727
728
11.8M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
729
11.8M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
730
11.8M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
731
11.8M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
732
11.8M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
733
11.8M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
734
11.8M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
735
11.8M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
736
737
11.8M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
738
11.8M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
739
11.8M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
740
11.8M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
741
11.8M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
742
11.8M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
743
11.8M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
744
11.8M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
745
746
11.8M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
747
11.8M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
748
11.8M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
749
11.8M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
750
11.8M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
751
11.8M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
752
11.8M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
753
11.8M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
754
755
11.8M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
756
11.8M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
757
11.8M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
758
11.8M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
759
11.8M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
760
11.8M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
761
11.8M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
762
11.8M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
763
764
11.8M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
765
11.8M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
766
11.8M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
767
11.8M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
768
769
11.8M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
770
11.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
771
11.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
772
11.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
773
774
11.8M
            ht -= 4;
775
11.8M
            pu1_src += src_strd << 2;
776
11.8M
            pu1_dst += dst_strd << 2;
777
11.8M
        }
778
11.8M
        while(ht > 0);
779
5.93M
    }
780
6.08M
}
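
Note: for chroma, wt and ofst each carry both components: the U weight sits in the low 16 bits of wt and the V weight in the high 16 bits, while the U offset is byte 0 of ofst and the V offset is byte 1; the _mm_set1_epi32 broadcasts above rely on this layout to match the interleaved U/V samples. A hypothetical caller-side packing sketch, with example weight/offset values and assuming the pointers and strides are already set up:

/* Hypothetical packing of the chroma weight/offset arguments */
WORD32 wt_u = 64, wt_v = 48;        /* example U/V weights (assumed)  */
WORD32 ofst_u = 2, ofst_v = -3;     /* example U/V offsets (assumed)  */

WORD32 wt   = (wt_u & 0xffff) | (wt_v << 16);
WORD32 ofst = (ofst_u & 0xff) | ((ofst_v & 0xff) << 8);

ih264_weighted_pred_chroma_sse42(pu1_src, pu1_dst, src_strd, dst_strd,
                                 log_wd, wt, ofst, ht, wd);
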
781
782
/*****************************************************************************/
783
/*                                                                           */
784
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
785
/*                                                                           */
786
/*  Description   : This function performs the weighted biprediction as      */
787
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
788
/*                  prediction process" for luma. The function gets two      */
789
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
790
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
791
/*                  stores it in the destination block. (ht,wd) can be       */
792
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
793
/*                                                                           */
794
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
795
/*                  pu1_src2  - Pointer to source 2                          */
796
/*                  pu1_dst   - Pointer to destination                       */
797
/*                  src_strd1 - stride for source 1                          */
798
/*                  src_strd2 - stride for source 2                          */
799
/*                  dst_strd  - stride for destination                       */
800
/*                  log_wd    - number of bits to be rounded off             */
801
/*                  wt1       - weight value for source 1                    */
802
/*                  wt2       - weight value for source 2                    */
803
/*                  ofst1     - offset value for source 1                    */
804
/*                  ofst2     - offset value for source 2                    */
805
/*                  ht        - height of the block                          */
806
/*                  wd        - width of the block                           */
807
/*                                                                           */
808
/*  Issues        : None                                                     */
809
/*                                                                           */
810
/*  Revision History:                                                        */
811
/*                                                                           */
812
/*         DD MM YYYY   Author(s)       Changes                              */
813
/*         04 02 2015   Kaushik         Initial Version                      */
814
/*                      Senthoor                                             */
815
/*                                                                           */
816
/*****************************************************************************/
817
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
818
                                       UWORD8 *pu1_src2,
819
                                       UWORD8 *pu1_dst,
820
                                       WORD32 src_strd1,
821
                                       WORD32 src_strd2,
822
                                       WORD32 dst_strd,
823
                                       WORD32 log_wd,
824
                                       WORD32 wt1,
825
                                       WORD32 wt2,
826
                                       WORD32 ofst1,
827
                                       WORD32 ofst2,
828
                                       WORD32 ht,
829
                                       WORD32 wd)
830
733k
{
831
733k
    __m128i y1_0_16x8b, y1_1_16x8b;
832
733k
    __m128i y2_0_16x8b, y2_1_16x8b;
833
834
733k
    __m128i wt1_8x16b, wt2_8x16b;
835
733k
    __m128i ofst_8x16b, round_8x16b;
836
837
733k
    WORD32 ofst;
838
733k
    WORD32 round_val, shft;
839
840
733k
    wt1 = (WORD16)(wt1 & 0xffff);
841
733k
    wt2 = (WORD16)(wt2 & 0xffff);
842
733k
    round_val = 1 << log_wd;
843
733k
    shft = log_wd + 1;
844
733k
    ofst1 = (WORD8)(ofst1 & 0xff);
845
733k
    ofst2 = (WORD8)(ofst2 & 0xff);
846
733k
    ofst = (ofst1 + ofst2 + 1) >> 1;
847
848
733k
    wt1_8x16b = _mm_set1_epi16(wt1);
849
733k
    wt2_8x16b = _mm_set1_epi16(wt2);
850
733k
    round_8x16b = _mm_set1_epi16(round_val);
851
733k
    ofst_8x16b = _mm_set1_epi16(ofst);
852
853
733k
    if(wd == 4)
854
38.9k
    {
855
38.9k
        __m128i y1_2_16x8b, y1_3_16x8b;
856
38.9k
        __m128i y2_2_16x8b, y2_3_16x8b;
857
858
38.9k
        __m128i y1_0_8x16b, y1_2_8x16b;
859
38.9k
        __m128i y2_0_8x16b, y2_2_8x16b;
860
861
38.9k
        do
862
40.5k
        {
863
40.5k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
864
40.5k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
865
40.5k
            y1_2_16x8b = _mm_loadl_epi64(
866
40.5k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
867
40.5k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
868
869
40.5k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
870
40.5k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
871
40.5k
            y2_2_16x8b = _mm_loadl_epi64(
872
40.5k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
873
40.5k
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
874
875
40.5k
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
876
40.5k
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
877
40.5k
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
878
40.5k
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);
879
880
40.5k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
881
40.5k
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
882
40.5k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
883
40.5k
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
884
885
40.5k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
886
40.5k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
887
40.5k
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
888
40.5k
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
889
890
40.5k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
891
40.5k
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
892
893
40.5k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
894
40.5k
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
895
896
40.5k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
897
40.5k
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
898
899
40.5k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
900
40.5k
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
901
902
40.5k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
903
40.5k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
904
40.5k
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
905
40.5k
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);
906
907
40.5k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
908
40.5k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
909
40.5k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
910
40.5k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);
911
912
913
40.5k
            ht -= 4;
914
40.5k
            pu1_src1 += src_strd1 << 2;
915
40.5k
            pu1_src2 += src_strd2 << 2;
916
40.5k
            pu1_dst += dst_strd << 2;
917
40.5k
        }
918
40.5k
        while(ht > 0);
919
38.9k
    }
920
694k
    else if(wd == 8)
921
53.1k
    {
922
53.1k
        __m128i y1_2_16x8b, y1_3_16x8b;
923
53.1k
        __m128i y2_2_16x8b, y2_3_16x8b;
924
925
53.1k
        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
926
53.1k
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;
927
928
53.1k
        do
929
119k
        {
930
119k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
931
119k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
932
119k
            y1_2_16x8b = _mm_loadl_epi64(
933
119k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
934
119k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
935
936
119k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
937
119k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
938
119k
            y2_2_16x8b = _mm_loadl_epi64(
939
119k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
940
119k
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
941
942
119k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
943
119k
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
944
119k
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
945
119k
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);
946
947
119k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
948
119k
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
949
119k
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
950
119k
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);
951
952
119k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
953
119k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
954
119k
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
955
119k
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
956
957
119k
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
958
119k
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
959
119k
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
960
119k
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);
961
962
119k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
963
119k
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
964
119k
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
965
119k
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);
966
967
119k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
968
119k
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
969
119k
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
970
119k
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);
971
972
119k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
973
119k
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
974
119k
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
975
119k
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);
976
977
119k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
978
119k
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
979
119k
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
980
119k
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);
981
982
119k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
983
119k
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
984
119k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
985
119k
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);
986
987
119k
            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
988
119k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
989
119k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
990
119k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);
991
992
119k
            ht -= 4;
993
119k
            pu1_src1 += src_strd1 << 2;
994
119k
            pu1_src2 += src_strd2 << 2;
995
119k
            pu1_dst += dst_strd << 2;
996
119k
        }
997
119k
        while(ht > 0);
998
53.1k
    }
999
641k
    else // wd == 16
1000
641k
    {
1001
641k
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
1002
641k
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
1003
1004
641k
        __m128i zero_16x8b;
1005
641k
        zero_16x8b = _mm_set1_epi8(0);
1006
1007
641k
        do
1008
5.08M
        {
1009
5.08M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
1010
5.08M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
1011
5.08M
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
1012
5.08M
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
1013
1014
5.08M
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1015
5.08M
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
1016
5.08M
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1017
5.08M
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
1018
1019
5.08M
            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1020
5.08M
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
1021
5.08M
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1022
5.08M
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
1023
1024
5.08M
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
1025
5.08M
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
1026
5.08M
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
1027
5.08M
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
1028
1029
5.08M
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
1030
5.08M
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
1031
5.08M
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
1032
5.08M
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
1033
1034
5.08M
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
1035
5.08M
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
1036
5.08M
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
1037
5.08M
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
1038
1039
5.08M
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
1040
5.08M
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
1041
5.08M
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
1042
5.08M
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
1043
1044
5.08M
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
1045
5.08M
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
1046
5.08M
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
1047
5.08M
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
1048
1049
5.08M
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
1050
5.08M
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
1051
5.08M
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
1052
5.08M
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
1053
1054
5.08M
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
1055
5.08M
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
1056
1057
5.08M
            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
1058
5.08M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1059
1060
5.08M
            ht -= 2;
1061
5.08M
            pu1_src1 += src_strd1 << 1;
1062
5.08M
            pu1_src2 += src_strd2 << 1;
1063
5.08M
            pu1_dst += dst_strd << 1;
1064
5.08M
        }
1065
5.08M
        while(ht > 0);
1066
641k
    }
1067
733k
}
1068
1069
/*****************************************************************************/
1070
/*                                                                           */
1071
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072
/*                                                                           */
1073
/*  Description   : This function performs the weighted biprediction as      */
1074
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075
/*                  prediction process" for chroma. The function gets two    */
1076
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078
/*                  stores it in the destination block. (ht,wd) can be       */
1079
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080
/*                                                                           */
1081
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082
/*                  pu1_src2  - Pointer to source 2                          */
1083
/*                  pu1_dst   - Pointer to destination                       */
1084
/*                  src_strd1 - stride for source 1                          */
1085
/*                  src_strd2 - stride for source 2                          */
1086
/*                  dst_strd  - stride for destination                       */
1087
/*                  log_wd    - number of bits to be rounded off             */
1088
/*                  wt1       - weight values for u and v in source 1        */
1089
/*                  wt2       - weight values for u and v in source 2        */
1090
/*                  ofst1     - offset value for u and v in source 1         */
1091
/*                  ofst2     - offset value for u and v in source 2         */
1092
/*                  ht        - height of the block                          */
1093
/*                  wd        - width of the block                           */
1094
/*                                                                           */
1095
/*  Issues        : None                                                     */
1096
/*                                                                           */
1097
/*  Revision History:                                                        */
1098
/*                                                                           */
1099
/*         DD MM YYYY   Author(s)       Changes                              */
1100
/*         04 02 2015   Kaushik         Initial Version                      */
1101
/*                      Senthoor                                             */
1102
/*                                                                           */
1103
/*****************************************************************************/
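Editor's note: the doc comment above describes the sample-level operation that the SSE4.2 body below vectorises. As a minimal scalar sketch, not part of the covered file, with the helper names clip_u8 and bipred_pixel chosen purely for illustration, one chroma sample of one plane would be computed as:

    #include <stdint.h>

    /* Saturate a signed intermediate to the unsigned 8-bit output range. */
    static inline uint8_t clip_u8(int32_t x)
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    /* One sample of one chroma plane (u or v): weight both predictions,
     * add them, round with 2^log_wd, arithmetic-shift by (log_wd + 1)
     * (mirroring _mm_srai_epi16 above), add the averaged offset and
     * saturate to 8 bits. */
    static inline uint8_t bipred_pixel(uint8_t p1, uint8_t p2,
                                       int32_t wt1, int32_t wt2,
                                       int32_t ofst1, int32_t ofst2,
                                       int32_t log_wd)
    {
        int32_t round = 1 << log_wd;
        int32_t ofst  = (ofst1 + ofst2 + 1) >> 1;

        return clip_u8(((p1 * wt1 + p2 * wt2 + round) >> (log_wd + 1)) + ofst);
    }

In the actual function the u and v weights and offsets arrive packed in wt1/wt2 and ofst1/ofst2 (u in the low byte or halfword, v above it), which is why the body below unpacks ofst1/ofst2 into ofst_u/ofst_v and broadcasts the packed weights with _mm_set1_epi32; the sketch assumes the per-plane values have already been separated.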
1104
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
1105
                                         UWORD8 *pu1_src2,
1106
                                         UWORD8 *pu1_dst,
1107
                                         WORD32 src_strd1,
1108
                                         WORD32 src_strd2,
1109
                                         WORD32 dst_strd,
1110
                                         WORD32 log_wd,
1111
                                         WORD32 wt1,
1112
                                         WORD32 wt2,
1113
                                         WORD32 ofst1,
1114
                                         WORD32 ofst2,
1115
                                         WORD32 ht,
1116
                                         WORD32 wd)
1117
733k
{
1118
733k
    __m128i y1_0_16x8b, y1_1_16x8b;
1119
733k
    __m128i y2_0_16x8b, y2_1_16x8b;
1120
1121
733k
    __m128i wt1_8x16b, wt2_8x16b;
1122
733k
    __m128i ofst_8x16b, round_8x16b;
1123
1124
733k
    WORD32 ofst1_u, ofst2_u, ofst_u;
1125
733k
    WORD32 ofst1_v, ofst2_v, ofst_v;
1126
733k
    WORD32 round_val, shft, ofst_val;
1127
1128
733k
    round_val = 1 << log_wd;
1129
733k
    shft = log_wd + 1;
1130
1131
733k
    ofst1_u = (WORD8)(ofst1 & 0xff);
1132
733k
    ofst1_v = (WORD8)(ofst1 >> 8);
1133
733k
    ofst2_u = (WORD8)(ofst2 & 0xff);
1134
733k
    ofst2_v = (WORD8)(ofst2 >> 8);
1135
1136
733k
    wt1_8x16b = _mm_set1_epi32(wt1);
1137
733k
    wt2_8x16b = _mm_set1_epi32(wt2);
1138
1139
733k
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
1140
733k
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
1141
733k
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
1142
1143
733k
    round_8x16b = _mm_set1_epi16(round_val);
1144
733k
    ofst_8x16b = _mm_set1_epi32(ofst_val);
1145
1146
733k
    if(wd == 2)
1147
38.9k
    {
1148
38.9k
        __m128i y1_0_8x16b, y2_0_8x16b;
1149
1150
38.9k
        do
1151
40.5k
        {
1152
40.5k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
1153
40.5k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
1154
1155
40.5k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
1156
40.5k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
1157
1158
40.5k
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
1159
40.5k
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
1160
1161
40.5k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1162
40.5k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1163
1164
40.5k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
1165
40.5k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
1166
1167
40.5k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
1168
40.5k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
1169
1170
40.5k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
1171
40.5k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
1172
1173
40.5k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
1174
40.5k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
1175
1176
40.5k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
1177
40.5k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
1178
1179
40.5k
            ht -= 2;
1180
40.5k
            pu1_src1 += src_strd1 << 1;
1181
40.5k
            pu1_src2 += src_strd2 << 1;
1182
40.5k
            pu1_dst += dst_strd << 1;
1183
40.5k
        }
1184
40.5k
        while(ht > 0);
1185
38.9k
    }
1186
694k
    else if(wd == 4)
1187
53.1k
    {
1188
53.1k
        __m128i y1_0_8x16b, y1_1_8x16b;
1189
53.1k
        __m128i y2_0_8x16b, y2_1_8x16b;
1190
1191
53.1k
        do
1192
119k
        {
1193
119k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
1194
119k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
1195
1196
119k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
1197
119k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
1198
1199
119k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1200
119k
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1201
1202
119k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1203
119k
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1204
1205
119k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
1206
119k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
1207
119k
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
1208
119k
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
1209
1210
119k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
1211
119k
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
1212
1213
119k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
1214
119k
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
1215
1216
119k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
1217
119k
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
1218
1219
119k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
1220
119k
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
1221
1222
119k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
1223
119k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
1224
1225
119k
            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
1226
119k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1227
1228
119k
            ht -= 2;
1229
119k
            pu1_src1 += src_strd1 << 1;
1230
119k
            pu1_src2 += src_strd2 << 1;
1231
119k
            pu1_dst += dst_strd << 1;
1232
119k
        }
1233
119k
        while(ht > 0);
1234
53.1k
    }
1235
641k
    else // wd == 8
1236
641k
    {
1237
641k
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
1238
641k
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
1239
1240
641k
        __m128i zero_16x8b;
1241
641k
        zero_16x8b = _mm_set1_epi8(0);
1242
1243
641k
        do
1244
2.54M
        {
1245
2.54M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
1246
2.54M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
1247
2.54M
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
1248
2.54M
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
1249
1250
2.54M
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1251
2.54M
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
1252
2.54M
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1253
2.54M
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
1254
1255
2.54M
            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1256
2.54M
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
1257
2.54M
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1258
2.54M
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
1259
1260
2.54M
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
1261
2.54M
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
1262
2.54M
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
1263
2.54M
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
1264
1265
2.54M
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
1266
2.54M
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
1267
2.54M
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
1268
2.54M
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
1269
1270
2.54M
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
1271
2.54M
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
1272
2.54M
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
1273
2.54M
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
1274
1275
2.54M
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
1276
2.54M
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
1277
2.54M
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
1278
2.54M
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
1279
1280
2.54M
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
1281
2.54M
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
1282
2.54M
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
1283
2.54M
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
1284
1285
2.54M
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
1286
2.54M
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
1287
2.54M
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
1288
2.54M
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
1289
1290
2.54M
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
1291
2.54M
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
1292
1293
2.54M
            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
1294
2.54M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1295
1296
2.54M
            ht -= 2;
1297
2.54M
            pu1_src1 += src_strd1 << 1;
1298
2.54M
            pu1_src2 += src_strd2 << 1;
1299
2.54M
            pu1_dst += dst_strd << 1;
1300
2.54M
        }
1301
2.54M
        while(ht > 0);
1302
641k
    }
1303
733k
}