Coverage Report

Created: 2026-01-10 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_weighted_pred_sse42.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for weighted           */
25
/*                      prediction functions in x86 sse4 intrinsics          */
26
/*                                                                           */
27
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
28
/*                      ih264_default_weighted_pred_chroma_sse42()           */
29
/*                      ih264_weighted_pred_luma_sse42()                     */
30
/*                      ih264_weighted_pred_chroma_sse42()                   */
31
/*                      ih264_weighted_bipred_luma_sse42()                   */
32
/*                      ih264_weighted_bipred_chroma_sse42()                 */
33
/*                                                                           */
34
/*  Issues / Problems : None                                                 */
35
/*                                                                           */
36
/*  Revision History  :                                                      */
37
/*                                                                           */
38
/*         DD MM YYYY   Author(s)       Changes                              */
39
/*         30 01 2015   Kaushik         Initial version                      */
40
/*                      Senthoor                                             */
41
/*                                                                           */
42
/*****************************************************************************/
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
#include <immintrin.h>
48
#include "ih264_typedefs.h"
49
#include "ih264_macros.h"
50
#include "ih264_platform_macros.h"
51
#include "ih264_weighted_pred.h"
52
53
/*****************************************************************************/
54
/*  Function definitions .                                                   */
55
/*****************************************************************************/
56
/*****************************************************************************/
57
/*                                                                           */
58
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
59
/*                                                                           */
60
/*  Description   : This function performs the default weighted prediction   */
61
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
62
/*                  sample prediction process" for luma. The function gets   */
63
/*                  two ht x wd blocks, calculates their rounded-average and */
64
/*                  stores it in the destination block. (ht,wd) can be       */
65
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
66
/*                                                                           */
67
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
68
/*                  pu1_src2  - Pointer to source 2                          */
69
/*                  pu1_dst   - Pointer to destination                       */
70
/*                  src_strd1 - stride for source 1                          */
71
/*                  src_strd1 - stride for source 2                          */
72
/*                  dst_strd  - stride for destination                       */
73
/*                  ht        - height of the block                          */
74
/*                  wd        - width of the block                           */
75
/*                                                                           */
76
/*  Issues        : None                                                     */
77
/*                                                                           */
78
/*  Revision History:                                                        */
79
/*                                                                           */
80
/*         DD MM YYYY   Author(s)       Changes                              */
81
/*         04 02 2015   Kaushik         Initial Version                      */
82
/*                      Senthoor                                             */
83
/*                                                                           */
84
/*****************************************************************************/
85
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
86
                                            UWORD8 *pu1_src2,
87
                                            UWORD8 *pu1_dst,
88
                                            WORD32 src_strd1,
89
                                            WORD32 src_strd2,
90
                                            WORD32 dst_strd,
91
                                            WORD32 ht,
92
                                            WORD32 wd)
93
1.15M
{
94
1.15M
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
95
1.15M
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
96
97
1.15M
    if(wd == 4)
98
5.55k
    {
99
5.55k
        do
100
7.05k
        {
101
7.05k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
102
7.05k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
103
7.05k
            y0_2_16x8b = _mm_loadl_epi64(
104
7.05k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
105
7.05k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
106
107
7.05k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
108
7.05k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
109
7.05k
            y1_2_16x8b = _mm_loadl_epi64(
110
7.05k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
111
7.05k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
112
113
7.05k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
114
7.05k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
115
7.05k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
116
7.05k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
117
118
7.05k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
119
7.05k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
120
7.05k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
121
7.05k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
122
123
7.05k
            ht -= 4;
124
7.05k
            pu1_src1 += src_strd1 << 2;
125
7.05k
            pu1_src2 += src_strd2 << 2;
126
7.05k
            pu1_dst += dst_strd << 2;
127
7.05k
        }
128
7.05k
        while(ht > 0);
129
5.55k
    }
130
1.14M
    else if(wd == 8)
131
61.9k
    {
132
61.9k
        do
133
145k
        {
134
145k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
135
145k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
136
145k
            y0_2_16x8b = _mm_loadl_epi64(
137
145k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
138
145k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
139
140
145k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
141
145k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
142
145k
            y1_2_16x8b = _mm_loadl_epi64(
143
145k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
144
145k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
145
146
145k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
147
145k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
148
145k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
149
145k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
150
151
145k
            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
152
145k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
153
145k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
154
145k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
155
156
145k
            ht -= 4;
157
145k
            pu1_src1 += src_strd1 << 2;
158
145k
            pu1_src2 += src_strd2 << 2;
159
145k
            pu1_dst += dst_strd << 2;
160
145k
        }
161
145k
        while(ht > 0);
162
61.9k
    }
163
1.08M
    else // wd == 16
164
1.08M
    {
165
1.08M
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
166
1.08M
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
167
168
1.08M
        do
169
2.12M
        {
170
2.12M
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
171
2.12M
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
172
2.12M
            y0_2_16x8b = _mm_loadu_si128(
173
2.12M
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
174
2.12M
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
175
2.12M
            y0_4_16x8b = _mm_loadu_si128(
176
2.12M
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
177
2.12M
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
178
2.12M
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
179
2.12M
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
180
181
2.12M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
182
2.12M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
183
2.12M
            y1_2_16x8b = _mm_loadu_si128(
184
2.12M
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
185
2.12M
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
186
2.12M
            y1_4_16x8b = _mm_loadu_si128(
187
2.12M
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
188
2.12M
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
189
2.12M
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
190
2.12M
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
191
192
2.12M
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
193
2.12M
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
194
2.12M
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
195
2.12M
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
196
2.12M
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
197
2.12M
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
198
2.12M
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
199
2.12M
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
200
201
2.12M
            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
202
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
203
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
204
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
205
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
206
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
207
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
208
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
209
210
2.12M
            ht -= 8;
211
2.12M
            pu1_src1 += src_strd1 << 3;
212
2.12M
            pu1_src2 += src_strd2 << 3;
213
2.12M
            pu1_dst += dst_strd << 3;
214
2.12M
        }
215
2.12M
        while(ht > 0);
216
1.08M
    }
217
1.15M
}
218
219
/*****************************************************************************/
220
/*                                                                           */
221
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
222
/*                                                                           */
223
/*  Description   : This function performs the default weighted prediction   */
224
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
225
/*                  sample prediction process" for chroma. The function gets */
226
/*                  two ht x wd blocks, calculates their rounded-average and */
227
/*                  stores it in the destination block. (ht,wd) can be       */
228
/*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
229
/*                                                                           */
230
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
231
/*                  pu1_src2  - Pointer to source 2                          */
232
/*                  pu1_dst   - Pointer to destination                       */
233
/*                  src_strd1 - stride for source 1                          */
234
/*                  src_strd1 - stride for source 2                          */
235
/*                  dst_strd  - stride for destination                       */
236
/*                  ht        - height of the block                          */
237
/*                  wd        - width of the block                           */
238
/*                                                                           */
239
/*  Issues        : None                                                     */
240
/*                                                                           */
241
/*  Revision History:                                                        */
242
/*                                                                           */
243
/*         DD MM YYYY   Author(s)       Changes                              */
244
/*         04 02 2015   Kaushik         Initial Version                      */
245
/*                      Senthoor                                             */
246
/*                                                                           */
247
/*****************************************************************************/
248
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
249
                                              UWORD8 *pu1_src2,
250
                                              UWORD8 *pu1_dst,
251
                                              WORD32 src_strd1,
252
                                              WORD32 src_strd2,
253
                                              WORD32 dst_strd,
254
                                              WORD32 ht,
255
                                              WORD32 wd)
256
1.15M
{
257
1.15M
    __m128i uv0_0_16x8b, uv0_1_16x8b;
258
1.15M
    __m128i uv1_0_16x8b, uv1_1_16x8b;
259
260
1.15M
    if(wd == 2)
261
5.55k
    {
262
5.55k
        do
263
7.05k
        {
264
7.05k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
265
7.05k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
266
267
7.05k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
268
7.05k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
269
270
7.05k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
271
7.05k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
272
273
7.05k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
274
7.05k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
275
276
7.05k
            ht -= 2;
277
7.05k
            pu1_src1 += src_strd1 << 1;
278
7.05k
            pu1_src2 += src_strd2 << 1;
279
7.05k
            pu1_dst += dst_strd << 1;
280
7.05k
        }
281
7.05k
        while(ht > 0);
282
5.55k
    }
283
1.14M
    else if(wd == 4)
284
61.9k
    {
285
61.9k
        do
286
145k
        {
287
145k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
288
145k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
289
290
145k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
291
145k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
292
293
145k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
294
145k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
295
296
145k
            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
297
145k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
298
299
145k
            ht -= 2;
300
145k
            pu1_src1 += src_strd1 << 1;
301
145k
            pu1_src2 += src_strd2 << 1;
302
145k
            pu1_dst += dst_strd << 1;
303
145k
        }
304
145k
        while(ht > 0);
305
61.9k
    }
306
1.08M
    else // wd == 8
307
1.08M
    {
308
1.08M
        __m128i uv0_2_16x8b, uv0_3_16x8b;
309
1.08M
        __m128i uv1_2_16x8b, uv1_3_16x8b;
310
311
1.08M
        do
312
2.12M
        {
313
2.12M
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
314
2.12M
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
315
2.12M
            uv0_2_16x8b = _mm_loadu_si128(
316
2.12M
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
317
2.12M
            uv0_3_16x8b = _mm_loadu_si128(
318
2.12M
                            (__m128i *)(pu1_src1 + src_strd1 * 3));
319
320
2.12M
            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
321
2.12M
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
322
2.12M
            uv1_2_16x8b = _mm_loadu_si128(
323
2.12M
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
324
2.12M
            uv1_3_16x8b = _mm_loadu_si128(
325
2.12M
                            (__m128i *)(pu1_src2 + src_strd2 * 3));
326
327
2.12M
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
328
2.12M
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
329
2.12M
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
330
2.12M
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
331
332
2.12M
            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
333
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
334
2.12M
            _mm_storeu_si128(
335
2.12M
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
336
2.12M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
337
338
2.12M
            ht -= 4;
339
2.12M
            pu1_src1 += src_strd1 << 2;
340
2.12M
            pu1_src2 += src_strd2 << 2;
341
2.12M
            pu1_dst += dst_strd << 2;
342
2.12M
        }
343
2.12M
        while(ht > 0);
344
1.08M
    }
345
1.15M
}
346
347
/*****************************************************************************/
348
/*                                                                           */
349
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
350
/*                                                                           */
351
/*  Description   : This function performs the weighted prediction as        */
352
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
353
/*                  prediction process" for luma. The function gets one      */
354
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
355
/*                  saturates it to unsigned 8-bit and stores it in the      */
356
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
357
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
358
/*                                                                           */
359
/*  Inputs        : pu1_src  - Pointer to source                             */
360
/*                  pu1_dst  - Pointer to destination                        */
361
/*                  src_strd - stride for source                             */
362
/*                  dst_strd - stride for destination                        */
363
/*                  log_wd   - number of bits to be rounded off              */
364
/*                  wt       - weight value                                  */
365
/*                  ofst     - offset value                                  */
366
/*                  ht       - height of the block                           */
367
/*                  wd       - width of the block                            */
368
/*                                                                           */
369
/*  Issues        : None                                                     */
370
/*                                                                           */
371
/*  Revision History:                                                        */
372
/*                                                                           */
373
/*         DD MM YYYY   Author(s)       Changes                              */
374
/*         04 02 2015   Kaushik         Initial Version                      */
375
/*                      Senthoor                                             */
376
/*                                                                           */
377
/*****************************************************************************/
378
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
379
                                    UWORD8 *pu1_dst,
380
                                    WORD32 src_strd,
381
                                    WORD32 dst_strd,
382
                                    WORD32 log_wd,
383
                                    WORD32 wt,
384
                                    WORD32 ofst,
385
                                    WORD32 ht,
386
                                    WORD32 wd)
387
10.6M
{
388
10.6M
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
389
390
10.6M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
391
392
10.6M
    WORD32 round_val;
393
394
10.6M
    wt = (WORD16)(wt & 0xffff);
395
10.6M
    round_val = 1 << (log_wd - 1);
396
10.6M
    ofst = (WORD8)(ofst & 0xff);
397
398
10.6M
    wt_8x16b = _mm_set1_epi16(wt);
399
10.6M
    round_8x16b = _mm_set1_epi16(round_val);
400
10.6M
    ofst_8x16b = _mm_set1_epi16(ofst);
401
402
10.6M
    if(wd == 4)
403
79.9k
    {
404
79.9k
        __m128i y_0_8x16b, y_2_8x16b;
405
406
79.9k
        do
407
130k
        {
408
130k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
409
130k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
410
130k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
411
130k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
412
413
130k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
414
130k
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
415
416
130k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
417
130k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
418
419
130k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
420
130k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
421
422
130k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
423
130k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
424
425
130k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
426
130k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
427
428
130k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
429
130k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
430
431
130k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
432
130k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
433
130k
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
434
130k
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
435
436
130k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
437
130k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
438
130k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
439
130k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
440
441
130k
            ht -= 4;
442
130k
            pu1_src += src_strd << 2;
443
130k
            pu1_dst += dst_strd << 2;
444
130k
        }
445
130k
        while(ht > 0);
446
79.9k
    }
447
10.5M
    else if(wd == 8)
448
49.2k
    {
449
49.2k
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
450
451
49.2k
        do
452
117k
        {
453
117k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
454
117k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
455
117k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
456
117k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
457
458
117k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
459
117k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
460
117k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
461
117k
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
462
463
117k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
464
117k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
465
117k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
466
117k
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
467
468
117k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
469
117k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
470
117k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
471
117k
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
472
473
117k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
474
117k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
475
117k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
476
117k
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
477
478
117k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
479
117k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
480
117k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
481
117k
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
482
483
117k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
484
117k
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
485
117k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
486
117k
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
487
488
117k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
489
117k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
490
117k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
491
117k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
492
493
117k
            ht -= 4;
494
117k
            pu1_src += src_strd << 2;
495
117k
            pu1_dst += dst_strd << 2;
496
117k
        }
497
117k
        while(ht > 0);
498
49.2k
    }
499
10.4M
    else // wd == 16
500
10.4M
    {
501
10.4M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
502
10.4M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
503
504
10.4M
        __m128i zero_16x8b;
505
10.4M
        zero_16x8b = _mm_set1_epi8(0);
506
507
10.4M
        do
508
41.8M
        {
509
41.8M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
510
41.8M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
511
41.8M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
512
41.8M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
513
514
41.8M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
515
41.8M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
516
41.8M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
517
41.8M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
518
41.8M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
519
41.8M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
520
41.8M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
521
41.8M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
522
523
41.8M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
524
41.8M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
525
41.8M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
526
41.8M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
527
41.8M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
528
41.8M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
529
41.8M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
530
41.8M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
531
532
41.8M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
533
41.8M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
534
41.8M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
535
41.8M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
536
41.8M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
537
41.8M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
538
41.8M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
539
41.8M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
540
541
41.8M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
542
41.8M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
543
41.8M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
544
41.8M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
545
41.8M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
546
41.8M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
547
41.8M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
548
41.8M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
549
550
41.8M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
551
41.8M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
552
41.8M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
553
41.8M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
554
41.8M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
555
41.8M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
556
41.8M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
557
41.8M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
558
559
41.8M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
560
41.8M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
561
41.8M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
562
41.8M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
563
564
41.8M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
565
41.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
566
41.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
567
41.8M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
568
569
41.8M
            ht -= 4;
570
41.8M
            pu1_src += src_strd << 2;
571
41.8M
            pu1_dst += dst_strd << 2;
572
41.8M
        }
573
41.8M
        while(ht > 0);
574
10.4M
    }
575
10.6M
}
576
577
/*****************************************************************************/
578
/*                                                                           */
579
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
580
/*                                                                           */
581
/*  Description   : This function performs the weighted prediction as        */
582
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
583
/*                  prediction process" for chroma. The function gets one    */
584
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
585
/*                  saturates it to unsigned 8-bit and stores it in the      */
586
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
587
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
588
/*                                                                           */
589
/*  Inputs        : pu1_src  - Pointer to source                             */
590
/*                  pu1_dst  - Pointer to destination                        */
591
/*                  src_strd - stride for source                             */
592
/*                  dst_strd - stride for destination                        */
593
/*                  log_wd   - number of bits to be rounded off              */
594
/*                  wt       - weight values for u and v                     */
595
/*                  ofst     - offset values for u and v                     */
596
/*                  ht       - height of the block                           */
597
/*                  wd       - width of the block                            */
598
/*                                                                           */
599
/*  Issues        : None                                                     */
600
/*                                                                           */
601
/*  Revision History:                                                        */
602
/*                                                                           */
603
/*         DD MM YYYY   Author(s)       Changes                              */
604
/*         04 02 2015   Kaushik         Initial Version                      */
605
/*                      Senthoor                                             */
606
/*                                                                           */
607
/*****************************************************************************/
608
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
609
                                      UWORD8 *pu1_dst,
610
                                      WORD32 src_strd,
611
                                      WORD32 dst_strd,
612
                                      WORD32 log_wd,
613
                                      WORD32 wt,
614
                                      WORD32 ofst,
615
                                      WORD32 ht,
616
                                      WORD32 wd)
617
10.6M
{
618
10.6M
    __m128i y_0_16x8b, y_1_16x8b;
619
620
10.6M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
621
622
10.6M
    WORD32 ofst_u, ofst_v;
623
10.6M
    WORD32 round_val;
624
625
10.6M
    ofst_u = (WORD8)(ofst & 0xff);
626
10.6M
    ofst_v = (WORD8)(ofst >> 8);
627
10.6M
    round_val = 1 << (log_wd - 1);
628
10.6M
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);
629
630
10.6M
    wt_8x16b = _mm_set1_epi32(wt);
631
10.6M
    round_8x16b = _mm_set1_epi16(round_val);
632
10.6M
    ofst_8x16b = _mm_set1_epi32(ofst);
633
634
10.6M
    if(wd == 2)
635
79.9k
    {
636
79.9k
        __m128i y_0_8x16b;
637
638
79.9k
        do
639
130k
        {
640
130k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
641
130k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
642
643
130k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
644
645
130k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
646
647
130k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
648
649
130k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
650
651
130k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
652
653
130k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
654
655
130k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
656
130k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
657
658
130k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
659
130k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
660
661
130k
            ht -= 2;
662
130k
            pu1_src += src_strd << 1;
663
130k
            pu1_dst += dst_strd << 1;
664
130k
        }
665
130k
        while(ht > 0);
666
79.9k
    }
667
10.5M
    else if(wd == 4)
668
49.2k
    {
669
49.2k
        __m128i y_0_8x16b, y_1_8x16b;
670
671
49.2k
        do
672
117k
        {
673
117k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
674
117k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
675
676
117k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
677
117k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
678
679
117k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
680
117k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
681
682
117k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
683
117k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
684
685
117k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
686
117k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
687
688
117k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
689
117k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
690
691
117k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
692
117k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
693
694
117k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
695
117k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
696
697
117k
            ht -= 2;
698
117k
            pu1_src += src_strd << 1;
699
117k
            pu1_dst += dst_strd << 1;
700
117k
        }
701
117k
        while(ht > 0);
702
49.2k
    }
703
10.4M
    else // wd == 16
704
10.4M
    {
705
10.4M
        __m128i y_2_16x8b, y_3_16x8b;
706
10.4M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
707
10.4M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
708
709
10.4M
        __m128i zero_16x8b;
710
10.4M
        zero_16x8b = _mm_set1_epi8(0);
711
712
10.4M
        do
713
20.9M
        {
714
20.9M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
715
20.9M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
716
20.9M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
717
20.9M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
718
719
20.9M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
720
20.9M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
721
20.9M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
722
20.9M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
723
20.9M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
724
20.9M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
725
20.9M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
726
20.9M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
727
728
20.9M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
729
20.9M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
730
20.9M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
731
20.9M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
732
20.9M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
733
20.9M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
734
20.9M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
735
20.9M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
736
737
20.9M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
738
20.9M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
739
20.9M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
740
20.9M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
741
20.9M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
742
20.9M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
743
20.9M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
744
20.9M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
745
746
20.9M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
747
20.9M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
748
20.9M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
749
20.9M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
750
20.9M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
751
20.9M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
752
20.9M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
753
20.9M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
754
755
20.9M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
756
20.9M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
757
20.9M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
758
20.9M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
759
20.9M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
760
20.9M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
761
20.9M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
762
20.9M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
763
764
20.9M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
765
20.9M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
766
20.9M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
767
20.9M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
768
769
20.9M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
770
20.9M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
771
20.9M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
772
20.9M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
773
774
20.9M
            ht -= 4;
775
20.9M
            pu1_src += src_strd << 2;
776
20.9M
            pu1_dst += dst_strd << 2;
777
20.9M
        }
778
20.9M
        while(ht > 0);
779
10.4M
    }
780
10.6M
}
781
782
/*****************************************************************************/
783
/*                                                                           */
784
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
785
/*                                                                           */
786
/*  Description   : This function performs the weighted biprediction as      */
787
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
788
/*                  prediction process" for luma. The function gets two      */
789
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
790
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
791
/*                  stores it in the destination block. (ht,wd) can be       */
792
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
793
/*                                                                           */
794
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
795
/*                  pu1_src2  - Pointer to source 2                          */
796
/*                  pu1_dst   - Pointer to destination                       */
797
/*                  src_strd1 - stride for source 1                          */
798
/*                  src_strd2 - stride for source 2                          */
799
/*                  dst_strd2 - stride for destination                       */
800
/*                  log_wd    - number of bits to be rounded off             */
801
/*                  wt1       - weight value for source 1                    */
802
/*                  wt2       - weight value for source 2                    */
803
/*                  ofst1     - offset value for source 1                    */
804
/*                  ofst2     - offset value for source 2                    */
805
/*                  ht        - height of the block                          */
806
/*                  wd        - width of the block                           */
807
/*                                                                           */
808
/*  Issues        : None                                                     */
809
/*                                                                           */
810
/*  Revision History:                                                        */
811
/*                                                                           */
812
/*         DD MM YYYY   Author(s)       Changes                              */
813
/*         04 02 2015   Kaushik         Initial Version                      */
814
/*                      Senthoor                                             */
815
/*                                                                           */
816
/*****************************************************************************/
817
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
818
                                       UWORD8 *pu1_src2,
819
                                       UWORD8 *pu1_dst,
820
                                       WORD32 src_strd1,
821
                                       WORD32 src_strd2,
822
                                       WORD32 dst_strd,
823
                                       WORD32 log_wd,
824
                                       WORD32 wt1,
825
                                       WORD32 wt2,
826
                                       WORD32 ofst1,
827
                                       WORD32 ofst2,
828
                                       WORD32 ht,
829
                                       WORD32 wd)
830
2.81M
{
831
2.81M
    __m128i y1_0_16x8b, y1_1_16x8b;
832
2.81M
    __m128i y2_0_16x8b, y2_1_16x8b;
833
834
2.81M
    __m128i wt1_8x16b, wt2_8x16b;
835
2.81M
    __m128i ofst_8x16b, round_8x16b;
836
837
2.81M
    WORD32 ofst;
838
2.81M
    WORD32 round_val, shft;
839
840
2.81M
    wt1 = (WORD16)(wt1 & 0xffff);
841
2.81M
    wt2 = (WORD16)(wt2 & 0xffff);
842
2.81M
    round_val = 1 << log_wd;
843
2.81M
    shft = log_wd + 1;
844
2.81M
    ofst1 = (WORD8)(ofst1 & 0xff);
845
2.81M
    ofst2 = (WORD8)(ofst2 & 0xff);
846
2.81M
    ofst = (ofst1 + ofst2 + 1) >> 1;
847
848
2.81M
    wt1_8x16b = _mm_set1_epi16(wt1);
849
2.81M
    wt2_8x16b = _mm_set1_epi16(wt2);
850
2.81M
    round_8x16b = _mm_set1_epi16(round_val);
851
2.81M
    ofst_8x16b = _mm_set1_epi16(ofst);
852
853
2.81M
    if(wd == 4)
854
3.84k
    {
855
3.84k
        __m128i y1_2_16x8b, y1_3_16x8b;
856
3.84k
        __m128i y2_2_16x8b, y2_3_16x8b;
857
858
3.84k
        __m128i y1_0_8x16b, y1_2_8x16b;
859
3.84k
        __m128i y2_0_8x16b, y2_2_8x16b;
860
861
3.84k
        do
862
4.89k
        {
863
4.89k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
864
4.89k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
865
4.89k
            y1_2_16x8b = _mm_loadl_epi64(
866
4.89k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
867
4.89k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
868
869
4.89k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
870
4.89k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
871
4.89k
            y2_2_16x8b = _mm_loadl_epi64(
872
4.89k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
873
4.89k
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
874
875
4.89k
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
876
4.89k
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
877
4.89k
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
878
4.89k
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);
879
880
4.89k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
881
4.89k
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
882
4.89k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
883
4.89k
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
884
885
4.89k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
886
4.89k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
887
4.89k
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
888
4.89k
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
889
890
4.89k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
891
4.89k
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
892
893
4.89k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
894
4.89k
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
895
896
4.89k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
897
4.89k
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
898
899
4.89k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
900
4.89k
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
901
902
4.89k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
903
4.89k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
904
4.89k
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
905
4.89k
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);
906
907
4.89k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
908
4.89k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
909
4.89k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
910
4.89k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);
911
912
913
4.89k
            ht -= 4;
914
4.89k
            pu1_src1 += src_strd1 << 2;
915
4.89k
            pu1_src2 += src_strd2 << 2;
916
4.89k
            pu1_dst += dst_strd << 2;
917
4.89k
        }
918
4.89k
        while(ht > 0);
919
3.84k
    }
920
2.81M
    else if(wd == 8)
921
29.0k
    {
922
29.0k
        __m128i y1_2_16x8b, y1_3_16x8b;
923
29.0k
        __m128i y2_2_16x8b, y2_3_16x8b;
924
925
29.0k
        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
926
29.0k
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;
927
928
29.0k
        do
929
69.7k
        {
930
69.7k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
931
69.7k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
932
69.7k
            y1_2_16x8b = _mm_loadl_epi64(
933
69.7k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
934
69.7k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
935
936
69.7k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
937
69.7k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
938
69.7k
            y2_2_16x8b = _mm_loadl_epi64(
939
69.7k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
940
69.7k
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
941
942
69.7k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
943
69.7k
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
944
69.7k
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
945
69.7k
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);
946
947
69.7k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
948
69.7k
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
949
69.7k
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
950
69.7k
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);
951
952
69.7k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
953
69.7k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
954
69.7k
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
955
69.7k
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
956
957
69.7k
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
958
69.7k
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
959
69.7k
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
960
69.7k
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);
961
962
69.7k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
963
69.7k
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
964
69.7k
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
965
69.7k
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);
966
967
69.7k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
968
69.7k
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
969
69.7k
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
970
69.7k
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);
971
972
69.7k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
973
69.7k
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
974
69.7k
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
975
69.7k
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);
976
977
69.7k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
978
69.7k
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
979
69.7k
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
980
69.7k
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);
981
982
69.7k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
983
69.7k
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
984
69.7k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
985
69.7k
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);
986
987
69.7k
            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
988
69.7k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
989
69.7k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
990
69.7k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);
991
992
69.7k
            ht -= 4;
993
69.7k
            pu1_src1 += src_strd1 << 2;
994
69.7k
            pu1_src2 += src_strd2 << 2;
995
69.7k
            pu1_dst += dst_strd << 2;
996
69.7k
        }
997
69.7k
        while(ht > 0);
998
29.0k
    }
999
2.78M
    else // wd == 16
1000
2.78M
    {
1001
2.78M
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
1002
2.78M
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
1003
1004
2.78M
        __m128i zero_16x8b;
1005
2.78M
        zero_16x8b = _mm_set1_epi8(0);
1006
1007
2.78M
        do
1008
22.2M
        {
1009
22.2M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
1010
22.2M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
1011
22.2M
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
1012
22.2M
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
1013
1014
22.2M
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1015
22.2M
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
1016
22.2M
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1017
22.2M
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
1018
1019
22.2M
            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1020
22.2M
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
1021
22.2M
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1022
22.2M
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
1023
1024
22.2M
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
1025
22.2M
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
1026
22.2M
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
1027
22.2M
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
1028
1029
22.2M
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
1030
22.2M
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
1031
22.2M
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
1032
22.2M
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
1033
1034
22.2M
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
1035
22.2M
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
1036
22.2M
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
1037
22.2M
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
1038
1039
22.2M
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
1040
22.2M
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
1041
22.2M
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
1042
22.2M
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
1043
1044
22.2M
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
1045
22.2M
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
1046
22.2M
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
1047
22.2M
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
1048
1049
22.2M
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
1050
22.2M
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
1051
22.2M
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
1052
22.2M
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
1053
1054
22.2M
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
1055
22.2M
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
1056
1057
22.2M
            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
1058
22.2M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1059
1060
22.2M
            ht -= 2;
1061
22.2M
            pu1_src1 += src_strd1 << 1;
1062
22.2M
            pu1_src2 += src_strd2 << 1;
1063
22.2M
            pu1_dst += dst_strd << 1;
1064
22.2M
        }
1065
22.2M
        while(ht > 0);
1066
2.78M
    }
1067
2.81M
}
1068
1069
/*****************************************************************************/
1070
/*                                                                           */
1071
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072
/*                                                                           */
1073
/*  Description   : This function performs the weighted biprediction as      */
1074
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075
/*                  prediction process" for chroma. The function gets two    */
1076
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078
/*                  stores it in the destination block. (ht,wd) can be       */
1079
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080
/*                                                                           */
1081
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082
/*                  pu1_src2  - Pointer to source 2                          */
1083
/*                  pu1_dst   - Pointer to destination                       */
1084
/*                  src_strd1 - stride for source 1                          */
1085
/*                  src_strd2 - stride for source 2                          */
1086
/*                  dst_strd2 - stride for destination                       */
1087
/*                  log_wd    - number of bits to be rounded off             */
1088
/*                  wt1       - weight values for u and v in source 1        */
1089
/*                  wt2       - weight values for u and v in source 2        */
1090
/*                  ofst1     - offset value for u and v in source 1         */
1091
/*                  ofst2     - offset value for u and v in source 2         */
1092
/*                  ht        - height of the block                          */
1093
/*                  wd        - width of the block                           */
1094
/*                                                                           */
1095
/*  Issues        : None                                                     */
1096
/*                                                                           */
1097
/*  Revision History:                                                        */
1098
/*                                                                           */
1099
/*         DD MM YYYY   Author(s)       Changes                              */
1100
/*         04 02 2015   Kaushik         Initial Version                      */
1101
/*                      Senthoor                                             */
1102
/*                                                                           */
1103
/*****************************************************************************/
1104
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
1105
                                         UWORD8 *pu1_src2,
1106
                                         UWORD8 *pu1_dst,
1107
                                         WORD32 src_strd1,
1108
                                         WORD32 src_strd2,
1109
                                         WORD32 dst_strd,
1110
                                         WORD32 log_wd,
1111
                                         WORD32 wt1,
1112
                                         WORD32 wt2,
1113
                                         WORD32 ofst1,
1114
                                         WORD32 ofst2,
1115
                                         WORD32 ht,
1116
                                         WORD32 wd)
1117
2.81M
{
1118
2.81M
    __m128i y1_0_16x8b, y1_1_16x8b;
1119
2.81M
    __m128i y2_0_16x8b, y2_1_16x8b;
1120
1121
2.81M
    __m128i wt1_8x16b, wt2_8x16b;
1122
2.81M
    __m128i ofst_8x16b, round_8x16b;
1123
1124
2.81M
    WORD32 ofst1_u, ofst2_u, ofst_u;
1125
2.81M
    WORD32 ofst1_v, ofst2_v, ofst_v;
1126
2.81M
    WORD32 round_val, shft, ofst_val;
1127
1128
2.81M
    round_val = 1 << log_wd;
1129
2.81M
    shft = log_wd + 1;
1130
1131
2.81M
    ofst1_u = (WORD8)(ofst1 & 0xff);
1132
2.81M
    ofst1_v = (WORD8)(ofst1 >> 8);
1133
2.81M
    ofst2_u = (WORD8)(ofst2 & 0xff);
1134
2.81M
    ofst2_v = (WORD8)(ofst2 >> 8);
1135
1136
2.81M
    wt1_8x16b = _mm_set1_epi32(wt1);
1137
2.81M
    wt2_8x16b = _mm_set1_epi32(wt2);
1138
1139
2.81M
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
1140
2.81M
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
1141
2.81M
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
1142
1143
2.81M
    round_8x16b = _mm_set1_epi16(round_val);
1144
2.81M
    ofst_8x16b = _mm_set1_epi32(ofst_val);
1145
1146
2.81M
    if(wd == 2)
1147
3.84k
    {
1148
3.84k
        __m128i y1_0_8x16b, y2_0_8x16b;
1149
1150
3.84k
        do
1151
4.89k
        {
1152
4.89k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
1153
4.89k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
1154
1155
4.89k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
1156
4.89k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
1157
1158
4.89k
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
1159
4.89k
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
1160
1161
4.89k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1162
4.89k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1163
1164
4.89k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
1165
4.89k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
1166
1167
4.89k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
1168
4.89k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
1169
1170
4.89k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
1171
4.89k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
1172
1173
4.89k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
1174
4.89k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
1175
1176
4.89k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
1177
4.89k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
1178
1179
4.89k
            ht -= 2;
1180
4.89k
            pu1_src1 += src_strd1 << 1;
1181
4.89k
            pu1_src2 += src_strd2 << 1;
1182
4.89k
            pu1_dst += dst_strd << 1;
1183
4.89k
        }
1184
4.89k
        while(ht > 0);
1185
3.84k
    }
1186
2.81M
    else if(wd == 4)
1187
29.0k
    {
1188
29.0k
        __m128i y1_0_8x16b, y1_1_8x16b;
1189
29.0k
        __m128i y2_0_8x16b, y2_1_8x16b;
1190
1191
29.0k
        do
1192
69.7k
        {
1193
69.7k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
1194
69.7k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
1195
1196
69.7k
            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
1197
69.7k
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
1198
1199
69.7k
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1200
69.7k
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1201
1202
69.7k
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1203
69.7k
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1204
1205
69.7k
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
1206
69.7k
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
1207
69.7k
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
1208
69.7k
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
1209
1210
69.7k
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
1211
69.7k
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
1212
1213
69.7k
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
1214
69.7k
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
1215
1216
69.7k
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
1217
69.7k
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
1218
1219
69.7k
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
1220
69.7k
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
1221
1222
69.7k
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
1223
69.7k
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
1224
1225
69.7k
            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
1226
69.7k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1227
1228
69.7k
            ht -= 2;
1229
69.7k
            pu1_src1 += src_strd1 << 1;
1230
69.7k
            pu1_src2 += src_strd2 << 1;
1231
69.7k
            pu1_dst += dst_strd << 1;
1232
69.7k
        }
1233
69.7k
        while(ht > 0);
1234
29.0k
    }
1235
2.78M
    else // wd == 8
1236
2.78M
    {
1237
2.78M
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
1238
2.78M
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
1239
1240
2.78M
        __m128i zero_16x8b;
1241
2.78M
        zero_16x8b = _mm_set1_epi8(0);
1242
1243
2.78M
        do
1244
11.1M
        {
1245
11.1M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
1246
11.1M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
1247
11.1M
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
1248
11.1M
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
1249
1250
11.1M
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
1251
11.1M
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
1252
11.1M
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
1253
11.1M
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
1254
1255
11.1M
            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
1256
11.1M
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
1257
11.1M
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
1258
11.1M
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
1259
1260
11.1M
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
1261
11.1M
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
1262
11.1M
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
1263
11.1M
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
1264
1265
11.1M
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
1266
11.1M
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
1267
11.1M
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
1268
11.1M
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
1269
1270
11.1M
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
1271
11.1M
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
1272
11.1M
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
1273
11.1M
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
1274
1275
11.1M
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
1276
11.1M
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
1277
11.1M
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
1278
11.1M
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
1279
1280
11.1M
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
1281
11.1M
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
1282
11.1M
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
1283
11.1M
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
1284
1285
11.1M
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
1286
11.1M
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
1287
11.1M
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
1288
11.1M
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
1289
1290
11.1M
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
1291
11.1M
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
1292
1293
11.1M
            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
1294
11.1M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
1295
1296
11.1M
            ht -= 2;
1297
11.1M
            pu1_src1 += src_strd1 << 1;
1298
11.1M
            pu1_src2 += src_strd2 << 1;
1299
11.1M
            pu1_dst += dst_strd << 1;
1300
11.1M
        }
1301
11.1M
        while(ht > 0);
1302
2.78M
    }
1303
2.81M
}