Coverage Report

Created: 2025-10-10 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_weighted_pred_sse42.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for weighted           */
25
/*                      prediction functions in x86 sse4 intrinsics          */
26
/*                                                                           */
27
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
28
/*                      ih264_default_weighted_pred_chroma_sse42()           */
29
/*                      ih264_weighted_pred_luma_sse42()                     */
30
/*                      ih264_weighted_pred_chroma_sse42()                   */
31
/*                      ih264_weighted_bipred_luma_sse42()                   */
32
/*                      ih264_weighted_bipred_chroma_sse42()                 */
33
/*                                                                           */
34
/*  Issues / Problems : None                                                 */
35
/*                                                                           */
36
/*  Revision History  :                                                      */
37
/*                                                                           */
38
/*         DD MM YYYY   Author(s)       Changes                              */
39
/*         30 01 2015   Kaushik         Initial version                      */
40
/*                      Senthoor                                             */
41
/*                                                                           */
42
/*****************************************************************************/
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
#include <immintrin.h>
48
#include "ih264_typedefs.h"
49
#include "ih264_macros.h"
50
#include "ih264_platform_macros.h"
51
#include "ih264_weighted_pred.h"
52
53
/*****************************************************************************/
54
/*  Function definitions .                                                   */
55
/*****************************************************************************/
56
/*****************************************************************************/
57
/*                                                                           */
58
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
59
/*                                                                           */
60
/*  Description   : This function performs the default weighted prediction   */
61
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
62
/*                  sample prediction process" for luma. The function gets   */
63
/*                  two ht x wd blocks, calculates their rounded-average and */
64
/*                  stores it in the destination block. (ht,wd) can be       */
65
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
66
/*                                                                           */
67
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
68
/*                  pu1_src2  - Pointer to source 2                          */
69
/*                  pu1_dst   - Pointer to destination                       */
70
/*                  src_strd1 - stride for source 1                          */
71
/*                  src_strd2 - stride for source 2                          */
72
/*                  dst_strd  - stride for destination                       */
73
/*                  ht        - height of the block                          */
74
/*                  wd        - width of the block                           */
75
/*                                                                           */
76
/*  Issues        : None                                                     */
77
/*                                                                           */
78
/*  Revision History:                                                        */
79
/*                                                                           */
80
/*         DD MM YYYY   Author(s)       Changes                              */
81
/*         04 02 2015   Kaushik         Initial Version                      */
82
/*                      Senthoor                                             */
83
/*                                                                           */
84
/*****************************************************************************/
85
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
86
                                            UWORD8 *pu1_src2,
87
                                            UWORD8 *pu1_dst,
88
                                            WORD32 src_strd1,
89
                                            WORD32 src_strd2,
90
                                            WORD32 dst_strd,
91
                                            WORD32 ht,
92
                                            WORD32 wd)
93
505k
{
94
505k
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
95
505k
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
96
97
505k
    if(wd == 4)
98
7.94k
    {
99
7.94k
        do
100
9.71k
        {
101
9.71k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
102
9.71k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
103
9.71k
            y0_2_16x8b = _mm_loadl_epi64(
104
9.71k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
105
9.71k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
106
107
9.71k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
108
9.71k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
109
9.71k
            y1_2_16x8b = _mm_loadl_epi64(
110
9.71k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
111
9.71k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
112
113
9.71k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
114
9.71k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
115
9.71k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
116
9.71k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
117
118
9.71k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
119
9.71k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
120
9.71k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
121
9.71k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
122
123
9.71k
            ht -= 4;
124
9.71k
            pu1_src1 += src_strd1 << 2;
125
9.71k
            pu1_src2 += src_strd2 << 2;
126
9.71k
            pu1_dst += dst_strd << 2;
127
9.71k
        }
128
9.71k
        while(ht > 0);
129
7.94k
    }
130
497k
    else if(wd == 8)
131
100k
    {
132
100k
        do
133
235k
        {
134
235k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
135
235k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
136
235k
            y0_2_16x8b = _mm_loadl_epi64(
137
235k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
138
235k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
139
140
235k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
141
235k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
142
235k
            y1_2_16x8b = _mm_loadl_epi64(
143
235k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
144
235k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
145
146
235k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
147
235k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
148
235k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
149
235k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
150
151
235k
            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
152
235k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
153
235k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
154
235k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
155
156
235k
            ht -= 4;
157
235k
            pu1_src1 += src_strd1 << 2;
158
235k
            pu1_src2 += src_strd2 << 2;
159
235k
            pu1_dst += dst_strd << 2;
160
235k
        }
161
235k
        while(ht > 0);
162
100k
    }
163
397k
    else // wd == 16
164
397k
    {
165
397k
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
166
397k
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
167
168
397k
        do
169
750k
        {
170
750k
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
171
750k
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
172
750k
            y0_2_16x8b = _mm_loadu_si128(
173
750k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
174
750k
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
175
750k
            y0_4_16x8b = _mm_loadu_si128(
176
750k
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
177
750k
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
178
750k
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
179
750k
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
180
181
750k
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
182
750k
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
183
750k
            y1_2_16x8b = _mm_loadu_si128(
184
750k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
185
750k
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
186
750k
            y1_4_16x8b = _mm_loadu_si128(
187
750k
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
188
750k
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
189
750k
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
190
750k
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
191
192
750k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
193
750k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
194
750k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
195
750k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
196
750k
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
197
750k
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
198
750k
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
199
750k
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
200
201
750k
            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
202
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
203
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
204
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
205
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
206
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
207
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
208
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
209
210
750k
            ht -= 8;
211
750k
            pu1_src1 += src_strd1 << 3;
212
750k
            pu1_src2 += src_strd2 << 3;
213
750k
            pu1_dst += dst_strd << 3;
214
750k
        }
215
750k
        while(ht > 0);
216
397k
    }
217
505k
}
218
219
/*****************************************************************************/
220
/*                                                                           */
221
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
222
/*                                                                           */
223
/*  Description   : This function performs the default weighted prediction   */
224
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
225
/*                  sample prediction process" for chroma. The function gets */
226
/*                  two ht x wd blocks, calculates their rounded-average and */
227
/*                  stores it in the destination block. (ht,wd) can be       */
228
/*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
229
/*                                                                           */
230
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
231
/*                  pu1_src2  - Pointer to source 2                          */
232
/*                  pu1_dst   - Pointer to destination                       */
233
/*                  src_strd1 - stride for source 1                          */
234
/*                  src_strd2 - stride for source 2                          */
235
/*                  dst_strd  - stride for destination                       */
236
/*                  ht        - height of the block                          */
237
/*                  wd        - width of the block                           */
238
/*                                                                           */
239
/*  Issues        : None                                                     */
240
/*                                                                           */
241
/*  Revision History:                                                        */
242
/*                                                                           */
243
/*         DD MM YYYY   Author(s)       Changes                              */
244
/*         04 02 2015   Kaushik         Initial Version                      */
245
/*                      Senthoor                                             */
246
/*                                                                           */
247
/*****************************************************************************/
248
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
249
                                              UWORD8 *pu1_src2,
250
                                              UWORD8 *pu1_dst,
251
                                              WORD32 src_strd1,
252
                                              WORD32 src_strd2,
253
                                              WORD32 dst_strd,
254
                                              WORD32 ht,
255
                                              WORD32 wd)
256
505k
{
257
505k
    __m128i uv0_0_16x8b, uv0_1_16x8b;
258
505k
    __m128i uv1_0_16x8b, uv1_1_16x8b;
259
260
505k
    if(wd == 2)
261
7.94k
    {
262
7.94k
        do
263
9.71k
        {
264
9.71k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
265
9.71k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
266
267
9.71k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
268
9.71k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
269
270
9.71k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
271
9.71k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
272
273
9.71k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
274
9.71k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
275
276
9.71k
            ht -= 2;
277
9.71k
            pu1_src1 += src_strd1 << 1;
278
9.71k
            pu1_src2 += src_strd2 << 1;
279
9.71k
            pu1_dst += dst_strd << 1;
280
9.71k
        }
281
9.71k
        while(ht > 0);
282
7.94k
    }
283
497k
    else if(wd == 4)
284
100k
    {
285
100k
        do
286
235k
        {
287
235k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
288
235k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
289
290
235k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
291
235k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
292
293
235k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
294
235k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
295
296
235k
            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
297
235k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
298
299
235k
            ht -= 2;
300
235k
            pu1_src1 += src_strd1 << 1;
301
235k
            pu1_src2 += src_strd2 << 1;
302
235k
            pu1_dst += dst_strd << 1;
303
235k
        }
304
235k
        while(ht > 0);
305
100k
    }
306
397k
    else // wd == 8
307
397k
    {
308
397k
        __m128i uv0_2_16x8b, uv0_3_16x8b;
309
397k
        __m128i uv1_2_16x8b, uv1_3_16x8b;
310
311
397k
        do
312
750k
        {
313
750k
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
314
750k
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
315
750k
            uv0_2_16x8b = _mm_loadu_si128(
316
750k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
317
750k
            uv0_3_16x8b = _mm_loadu_si128(
318
750k
                            (__m128i *)(pu1_src1 + src_strd1 * 3));
319
320
750k
            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
321
750k
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
322
750k
            uv1_2_16x8b = _mm_loadu_si128(
323
750k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
324
750k
            uv1_3_16x8b = _mm_loadu_si128(
325
750k
                            (__m128i *)(pu1_src2 + src_strd2 * 3));
326
327
750k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
328
750k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
329
750k
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
330
750k
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
331
332
750k
            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
333
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
334
750k
            _mm_storeu_si128(
335
750k
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
336
750k
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
337
338
750k
            ht -= 4;
339
750k
            pu1_src1 += src_strd1 << 2;
340
750k
            pu1_src2 += src_strd2 << 2;
341
750k
            pu1_dst += dst_strd << 2;
342
750k
        }
343
750k
        while(ht > 0);
344
397k
    }
345
505k
}
346
347
/*****************************************************************************/
348
/*                                                                           */
349
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
350
/*                                                                           */
351
/*  Description   : This function performs the weighted prediction as        */
352
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
353
/*                  prediction process" for luma. The function gets one      */
354
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
355
/*                  saturates it to unsigned 8-bit and stores it in the      */
356
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
357
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
358
/*                                                                           */
359
/*  Inputs        : pu1_src  - Pointer to source                             */
360
/*                  pu1_dst  - Pointer to destination                        */
361
/*                  src_strd - stride for source                             */
362
/*                  dst_strd - stride for destination                        */
363
/*                  log_wd   - number of bits to be rounded off              */
364
/*                  wt       - weight value                                  */
365
/*                  ofst     - offset value                                  */
366
/*                  ht       - height of the block                           */
367
/*                  wd       - width of the block                            */
368
/*                                                                           */
369
/*  Issues        : None                                                     */
370
/*                                                                           */
371
/*  Revision History:                                                        */
372
/*                                                                           */
373
/*         DD MM YYYY   Author(s)       Changes                              */
374
/*         04 02 2015   Kaushik         Initial Version                      */
375
/*                      Senthoor                                             */
376
/*                                                                           */
377
/*****************************************************************************/
378
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
379
                                    UWORD8 *pu1_dst,
380
                                    WORD32 src_strd,
381
                                    WORD32 dst_strd,
382
                                    WORD32 log_wd,
383
                                    WORD32 wt,
384
                                    WORD32 ofst,
385
                                    WORD32 ht,
386
                                    WORD32 wd)
387
5.83M
{
388
5.83M
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
389
390
5.83M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
391
392
5.83M
    WORD32 round_val;
393
394
5.83M
    wt = (WORD16)(wt & 0xffff);
395
5.83M
    round_val = 1 << (log_wd - 1);
396
5.83M
    ofst = (WORD8)(ofst & 0xff);
397
398
5.83M
    wt_8x16b = _mm_set1_epi16(wt);
399
5.83M
    round_8x16b = _mm_set1_epi16(round_val);
400
5.83M
    ofst_8x16b = _mm_set1_epi16(ofst);
401
402
5.83M
    if(wd == 4)
403
77.5k
    {
404
77.5k
        __m128i y_0_8x16b, y_2_8x16b;
405
406
77.5k
        do
407
133k
        {
408
133k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
409
133k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
410
133k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
411
133k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
412
413
133k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
414
133k
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
415
416
133k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
417
133k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
418
419
133k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
420
133k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
421
422
133k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
423
133k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
424
425
133k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
426
133k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
427
428
133k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
429
133k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
430
431
133k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
432
133k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
433
133k
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
434
133k
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
435
436
133k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
437
133k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
438
133k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
439
133k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
440
441
133k
            ht -= 4;
442
133k
            pu1_src += src_strd << 2;
443
133k
            pu1_dst += dst_strd << 2;
444
133k
        }
445
133k
        while(ht > 0);
446
77.5k
    }
447
5.75M
    else if(wd == 8)
448
74.0k
    {
449
74.0k
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
450
451
74.0k
        do
452
197k
        {
453
197k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
454
197k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
455
197k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
456
197k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
457
458
197k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
459
197k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
460
197k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
461
197k
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
462
463
197k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
464
197k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
465
197k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
466
197k
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
467
468
197k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
469
197k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
470
197k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
471
197k
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
472
473
197k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
474
197k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
475
197k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
476
197k
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
477
478
197k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
479
197k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
480
197k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
481
197k
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
482
483
197k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
484
197k
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
485
197k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
486
197k
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
487
488
197k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
489
197k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
490
197k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
491
197k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
492
493
197k
            ht -= 4;
494
197k
            pu1_src += src_strd << 2;
495
197k
            pu1_dst += dst_strd << 2;
496
197k
        }
497
197k
        while(ht > 0);
498
74.0k
    }
499
5.68M
    else // wd == 16
500
5.68M
    {
501
5.68M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
502
5.68M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
503
504
5.68M
        __m128i zero_16x8b;
505
5.68M
        zero_16x8b = _mm_set1_epi8(0);
506
507
5.68M
        do
508
22.6M
        {
509
22.6M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
510
22.6M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
511
22.6M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
512
22.6M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
513
514
22.6M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
515
22.6M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
516
22.6M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
517
22.6M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
518
22.6M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
519
22.6M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
520
22.6M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
521
22.6M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
522
523
22.6M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
524
22.6M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
525
22.6M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
526
22.6M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
527
22.6M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
528
22.6M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
529
22.6M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
530
22.6M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
531
532
22.6M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
533
22.6M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
534
22.6M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
535
22.6M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
536
22.6M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
537
22.6M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
538
22.6M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
539
22.6M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
540
541
22.6M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
542
22.6M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
543
22.6M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
544
22.6M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
545
22.6M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
546
22.6M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
547
22.6M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
548
22.6M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
549
550
22.6M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
551
22.6M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
552
22.6M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
553
22.6M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
554
22.6M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
555
22.6M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
556
22.6M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
557
22.6M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
558
559
22.6M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
560
22.6M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
561
22.6M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
562
22.6M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
563
564
22.6M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
565
22.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
566
22.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
567
22.6M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
568
569
22.6M
            ht -= 4;
570
22.6M
            pu1_src += src_strd << 2;
571
22.6M
            pu1_dst += dst_strd << 2;
572
22.6M
        }
573
22.6M
        while(ht > 0);
574
5.68M
    }
575
5.83M
}
576
577
/*****************************************************************************/
578
/*                                                                           */
579
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
580
/*                                                                           */
581
/*  Description   : This function performs the weighted prediction as        */
582
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
583
/*                  prediction process" for chroma. The function gets one    */
584
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
585
/*                  saturates it to unsigned 8-bit and stores it in the      */
586
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
587
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
588
/*                                                                           */
589
/*  Inputs        : pu1_src  - Pointer to source                             */
590
/*                  pu1_dst  - Pointer to destination                        */
591
/*                  src_strd - stride for source                             */
592
/*                  dst_strd - stride for destination                        */
593
/*                  log_wd   - number of bits to be rounded off              */
594
/*                  wt       - weight values for u and v                     */
595
/*                  ofst     - offset values for u and v                     */
596
/*                  ht       - height of the block                           */
597
/*                  wd       - width of the block                            */
598
/*                                                                           */
599
/*  Issues        : None                                                     */
600
/*                                                                           */
601
/*  Revision History:                                                        */
602
/*                                                                           */
603
/*         DD MM YYYY   Author(s)       Changes                              */
604
/*         04 02 2015   Kaushik         Initial Version                      */
605
/*                      Senthoor                                             */
606
/*                                                                           */
607
/*****************************************************************************/
608
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
609
                                      UWORD8 *pu1_dst,
610
                                      WORD32 src_strd,
611
                                      WORD32 dst_strd,
612
                                      WORD32 log_wd,
613
                                      WORD32 wt,
614
                                      WORD32 ofst,
615
                                      WORD32 ht,
616
                                      WORD32 wd)
617
5.83M
{
618
5.83M
    __m128i y_0_16x8b, y_1_16x8b;
619
620
5.83M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
621
622
5.83M
    WORD32 ofst_u, ofst_v;
623
5.83M
    WORD32 round_val;
624
625
5.83M
    ofst_u = (WORD8)(ofst & 0xff);
626
5.83M
    ofst_v = (WORD8)(ofst >> 8);
627
5.83M
    round_val = 1 << (log_wd - 1);
628
5.83M
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);
629
630
5.83M
    wt_8x16b = _mm_set1_epi32(wt);
631
5.83M
    round_8x16b = _mm_set1_epi16(round_val);
632
5.83M
    ofst_8x16b = _mm_set1_epi32(ofst);
633
634
5.83M
    if(wd == 2)
635
77.5k
    {
636
77.5k
        __m128i y_0_8x16b;
637
638
77.5k
        do
639
133k
        {
640
133k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
641
133k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
642
643
133k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
644
645
133k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
646
647
133k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
648
649
133k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
650
651
133k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
652
653
133k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
654
655
133k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
656
133k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
657
658
133k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
659
133k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
660
661
133k
            ht -= 2;
662
133k
            pu1_src += src_strd << 1;
663
133k
            pu1_dst += dst_strd << 1;
664
133k
        }
665
133k
        while(ht > 0);
666
77.5k
    }
667
5.75M
    else if(wd == 4)
668
74.0k
    {
669
74.0k
        __m128i y_0_8x16b, y_1_8x16b;
670
671
74.0k
        do
672
197k
        {
673
197k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
674
197k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
675
676
197k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
677
197k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
678
679
197k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
680
197k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
681
682
197k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
683
197k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
684
685
197k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
686
197k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
687
688
197k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
689
197k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
690
691
197k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
692
197k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
693
694
197k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
695
197k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
696
697
197k
            ht -= 2;
698
197k
            pu1_src += src_strd << 1;
699
197k
            pu1_dst += dst_strd << 1;
700
197k
        }
701
197k
        while(ht > 0);
702
74.0k
    }
703
5.68M
    else // wd == 16
704
5.68M
    {
705
5.68M
        __m128i y_2_16x8b, y_3_16x8b;
706
5.68M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
707
5.68M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
708
709
5.68M
        __m128i zero_16x8b;
710
5.68M
        zero_16x8b = _mm_set1_epi8(0);
711
712
5.68M
        do
713
11.3M
        {
714
11.3M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
715
11.3M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
716
11.3M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
717
11.3M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
718
719
11.3M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
720
11.3M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
721
11.3M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
722
11.3M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
723
11.3M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
724
11.3M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
725
11.3M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
726
11.3M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
727
728
11.3M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
729
11.3M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
730
11.3M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
731
11.3M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
732
11.3M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
733
11.3M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
734
11.3M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
735
11.3M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
736
737
11.3M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
738
11.3M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
739
11.3M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
740
11.3M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
741
11.3M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
742
11.3M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
743
11.3M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
744
11.3M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
745
746
11.3M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
747
11.3M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
748
11.3M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
749
11.3M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
750
11.3M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
751
11.3M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
752
11.3M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
753
11.3M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
754
755
11.3M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
756
11.3M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
757
11.3M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
758
11.3M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
759
11.3M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
760
11.3M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
761
11.3M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
762
11.3M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
763
764
11.3M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
765
11.3M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
766
11.3M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
767
11.3M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
768
769
11.3M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
770
11.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
771
11.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
772
11.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
773
774
11.3M
            ht -= 4;
775
11.3M
            pu1_src += src_strd << 2;
776
11.3M
            pu1_dst += dst_strd << 2;
777
11.3M
        }
778
11.3M
        while(ht > 0);
779
5.68M
    }
780
5.83M
}
781
782
/*****************************************************************************/
783
/*                                                                           */
784
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
785
/*                                                                           */
786
/*  Description   : This function performs the weighted biprediction as      */
787
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
788
/*                  prediction process" for luma. The function gets two      */
789
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
790
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
791
/*                  stores it in the destination block. (ht,wd) can be       */
792
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
793
/*                                                                           */
794
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
795
/*                  pu1_src2  - Pointer to source 2                          */
796
/*                  pu1_dst   - Pointer to destination                       */
797
/*                  src_strd1 - stride for source 1                          */
798
/*                  src_strd2 - stride for source 2                          */
799
/*                  dst_strd  - stride for destination                       */
800
/*                  log_wd    - number of bits to be rounded off             */
801
/*                  wt1       - weight value for source 1                    */
802
/*                  wt2       - weight value for source 2                    */
803
/*                  ofst1     - offset value for source 1                    */
804
/*                  ofst2     - offset value for source 2                    */
805
/*                  ht        - height of the block                          */
806
/*                  wd        - width of the block                           */
807
/*                                                                           */
808
/*  Issues        : None                                                     */
809
/*                                                                           */
810
/*  Revision History:                                                        */
811
/*                                                                           */
812
/*         DD MM YYYY   Author(s)       Changes                              */
813
/*         04 02 2015   Kaushik         Initial Version                      */
814
/*                      Senthoor                                             */
815
/*                                                                           */
816
/*****************************************************************************/
817
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* Weights arrive in the low 16 bits of each argument; sign-extend   */
    /* them so negative weights survive the 16-bit lane multiplication. */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    /* Bi-pred rounding per sec 8.4.2.3.2:                               */
    /*   pred = (p1*wt1 + p2*wt2 + 2^log_wd) >> (log_wd + 1), then + ofst */
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    /* Offsets are signed 8-bit values; the combined offset is the      */
    /* rounded average of the two.                                       */
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    /* NOTE(review): intermediate sums use saturating 16-bit adds, so   */
    /* extreme weight combinations clamp rather than wrap — presumably  */
    /* intentional for this 16-bit lane implementation.                 */
    if(wd == 4)
    {
        /* 4-pixel rows: pack four rows of each source into registers and */
        /* process all four rows per loop iteration.                      */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* pair up rows (0,1) and (2,3) so each register holds 8 pixels */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            /* (p1*wt1 + p2*wt2 + round) >> shft, then + ofst, saturated */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* saturate to u8; the packed register holds rows 0..3 as dwords */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);


            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* 8-pixel rows: one register per row, four rows per iteration */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            /* (p1*wt1 + p2*wt2 + round) >> shft, then + ofst, saturated */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            /* saturate to u8; each packed register holds two 8-byte rows */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* 16-pixel rows: widen each row into low/high 8x16-bit halves, */
        /* two rows per iteration.                                      */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* (p1*wt1 + p2*wt2 + round) >> shft, then + ofst, saturated */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* saturate to u8 and store both rows */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
1068
1069
/*****************************************************************************/
1070
/*                                                                           */
1071
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072
/*                                                                           */
1073
/*  Description   : This function performs the weighted biprediction as      */
1074
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075
/*                  prediction process" for chroma. The function gets two    */
1076
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078
/*                  stores it in the destination block. (ht,wd) can be       */
1079
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080
/*                                                                           */
1081
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082
/*                  pu1_src2  - Pointer to source 2                          */
1083
/*                  pu1_dst   - Pointer to destination                       */
1084
/*                  src_strd1 - stride for source 1                          */
1085
/*                  src_strd2 - stride for source 2                          */
1086
/*                  dst_strd2 - stride for destination                       */
1087
/*                  log_wd    - number of bits to be rounded off             */
1088
/*                  wt1       - weight values for u and v in source 1        */
1089
/*                  wt2       - weight values for u and v in source 2        */
1090
/*                  ofst1     - offset value for u and v in source 1         */
1091
/*                  ofst2     - offset value for u and v in source 2         */
1092
/*                  ht        - height of the block                          */
1093
/*                  wd        - width of the block                           */
1094
/*                                                                           */
1095
/*  Issues        : None                                                     */
1096
/*                                                                           */
1097
/*  Revision History:                                                        */
1098
/*                                                                           */
1099
/*         DD MM YYYY   Author(s)       Changes                              */
1100
/*         04 02 2015   Kaushik         Initial Version                      */
1101
/*                      Senthoor                                             */
1102
/*                                                                           */
1103
/*****************************************************************************/
1104
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    /* Two rows of interleaved UV bytes per iteration, for each source */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    /* Weight and offset/round constants replicated across all 16-bit lanes */
    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    /* Bi-prediction: out = ((p1*wt1 + p2*wt2 + 2^log_wd) >> (log_wd + 1)) + ofst */
    round_val = 1 << log_wd;
    shft = log_wd + 1;

    /* ofst1/ofst2 carry the U offset in the low byte and the V offset in the
       next byte; pull each out as a sign-extended 8-bit value */
    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    /* wt1/wt2 pack the 16-bit U and V weights into one dword (per the header
       comment); replicating the dword lines the weights up with the
       interleaved U,V,U,V... 16-bit lanes produced below */
    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    /* Combined bi-pred offset per component: (o1 + o2 + 1) >> 1 */
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    /* Re-pack as {U offset, V offset} in one dword, replicated per lane pair */
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    /* wd counts UV pairs, so each row spans 2*wd bytes.
       NOTE(review): every branch steps two rows per iteration with no
       odd-row tail — assumes ht is even; confirm with callers. */
    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        /* wd == 2: 4 interleaved UV bytes per row; both rows of each source
           are packed into a single register before widening */
        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* merge row0 (low dword) and row1 (next dword) of each source */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            /* zero-extend the 8 bytes (two rows) to 16-bit lanes */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            /* p1*wt1 and p2*wt2 */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            /* saturating sum plus rounding constant */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            /* arithmetic shift keeps the sign for negative weighted sums,
               then the combined offset is added */
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            /* clip to [0,255] and split the two rows back apart */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            /* 4-byte stores, one per destination row */
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        /* wd == 4: 8 interleaved UV bytes per row; each row fills one
           widened register */
        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* zero-extend each row's 8 bytes to 16-bit lanes */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            /* p1*wt1 and p2*wt2 for both rows */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            /* saturating sum of the two weighted sources */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            /* add rounding constant */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            /* arithmetic shift, then combined offset */
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            /* clip to [0,255]: row0 in the low qword, row1 in the high */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        /* wd == 8: 16 interleaved UV bytes per row; each row is split into
           low (L) and high (H) 8-lane halves for the 16-bit arithmetic */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* zero-extend: cvtepu8 widens the low 8 bytes, unpackhi with a
               zero register widens the high 8 bytes */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* p1*wt1 for all four half-rows */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            /* p2*wt2 for all four half-rows */
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* saturating sum of the two weighted sources */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* add rounding constant */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            /* arithmetic shift down by log_wd + 1 */
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            /* add the combined UV offsets */
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* clip to [0,255] and re-join the halves of each row */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}