Coverage Report

Created: 2026-02-26 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_weighted_pred_sse42.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for weighted           */
25
/*                      prediction functions in x86 sse4 intrinsics          */
26
/*                                                                           */
27
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
28
/*                      ih264_default_weighted_pred_chroma_sse42()           */
29
/*                      ih264_weighted_pred_luma_sse42()                     */
30
/*                      ih264_weighted_pred_chroma_sse42()                   */
31
/*                      ih264_weighted_bipred_luma_sse42()                   */
32
/*                      ih264_weighted_bipred_chroma_sse42()                 */
33
/*                                                                           */
34
/*  Issues / Problems : None                                                 */
35
/*                                                                           */
36
/*  Revision History  :                                                      */
37
/*                                                                           */
38
/*         DD MM YYYY   Author(s)       Changes                              */
39
/*         30 01 2015   Kaushik         Initial version                      */
40
/*                      Senthoor                                             */
41
/*                                                                           */
42
/*****************************************************************************/
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
#include <immintrin.h>
48
#include "ih264_typedefs.h"
49
#include "ih264_macros.h"
50
#include "ih264_platform_macros.h"
51
#include "ih264_weighted_pred.h"
52
53
/*****************************************************************************/
54
/*  Function definitions .                                                   */
55
/*****************************************************************************/
56
/*****************************************************************************/
57
/*                                                                           */
58
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
59
/*                                                                           */
60
/*  Description   : This function performs the default weighted prediction   */
61
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
62
/*                  sample prediction process" for luma. The function gets   */
63
/*                  two ht x wd blocks, calculates their rounded-average and */
64
/*                  stores it in the destination block. (ht,wd) can be       */
65
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
66
/*                                                                           */
67
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
68
/*                  pu1_src2  - Pointer to source 2                          */
69
/*                  pu1_dst   - Pointer to destination                       */
70
/*                  src_strd1 - stride for source 1                          */
71
/*                  src_strd2 - stride for source 2                          */
72
/*                  dst_strd  - stride for destination                       */
73
/*                  ht        - height of the block                          */
74
/*                  wd        - width of the block                           */
75
/*                                                                           */
76
/*  Issues        : None                                                     */
77
/*                                                                           */
78
/*  Revision History:                                                        */
79
/*                                                                           */
80
/*         DD MM YYYY   Author(s)       Changes                              */
81
/*         04 02 2015   Kaushik         Initial Version                      */
82
/*                      Senthoor                                             */
83
/*                                                                           */
84
/*****************************************************************************/
85
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
86
                                            UWORD8 *pu1_src2,
87
                                            UWORD8 *pu1_dst,
88
                                            WORD32 src_strd1,
89
                                            WORD32 src_strd2,
90
                                            WORD32 dst_strd,
91
                                            WORD32 ht,
92
                                            WORD32 wd)
93
1.10M
{
94
1.10M
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
95
1.10M
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
96
97
1.10M
    if(wd == 4)
98
11.1k
    {
99
11.1k
        do
100
13.8k
        {
101
13.8k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
102
13.8k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
103
13.8k
            y0_2_16x8b = _mm_loadl_epi64(
104
13.8k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
105
13.8k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
106
107
13.8k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
108
13.8k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
109
13.8k
            y1_2_16x8b = _mm_loadl_epi64(
110
13.8k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
111
13.8k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
112
113
13.8k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
114
13.8k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
115
13.8k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
116
13.8k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
117
118
13.8k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
119
13.8k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
120
13.8k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
121
13.8k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
122
123
13.8k
            ht -= 4;
124
13.8k
            pu1_src1 += src_strd1 << 2;
125
13.8k
            pu1_src2 += src_strd2 << 2;
126
13.8k
            pu1_dst += dst_strd << 2;
127
13.8k
        }
128
13.8k
        while(ht > 0);
129
11.1k
    }
130
1.09M
    else if(wd == 8)
131
102k
    {
132
102k
        do
133
231k
        {
134
231k
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
135
231k
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
136
231k
            y0_2_16x8b = _mm_loadl_epi64(
137
231k
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
138
231k
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
139
140
231k
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
141
231k
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
142
231k
            y1_2_16x8b = _mm_loadl_epi64(
143
231k
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
144
231k
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
145
146
231k
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
147
231k
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
148
231k
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
149
231k
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
150
151
231k
            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
152
231k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
153
231k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
154
231k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
155
156
231k
            ht -= 4;
157
231k
            pu1_src1 += src_strd1 << 2;
158
231k
            pu1_src2 += src_strd2 << 2;
159
231k
            pu1_dst += dst_strd << 2;
160
231k
        }
161
231k
        while(ht > 0);
162
102k
    }
163
991k
    else // wd == 16
164
991k
    {
165
991k
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
166
991k
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
167
168
991k
        do
169
1.94M
        {
170
1.94M
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
171
1.94M
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
172
1.94M
            y0_2_16x8b = _mm_loadu_si128(
173
1.94M
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
174
1.94M
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
175
1.94M
            y0_4_16x8b = _mm_loadu_si128(
176
1.94M
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
177
1.94M
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
178
1.94M
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
179
1.94M
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
180
181
1.94M
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
182
1.94M
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
183
1.94M
            y1_2_16x8b = _mm_loadu_si128(
184
1.94M
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
185
1.94M
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
186
1.94M
            y1_4_16x8b = _mm_loadu_si128(
187
1.94M
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
188
1.94M
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
189
1.94M
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
190
1.94M
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
191
192
1.94M
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
193
1.94M
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
194
1.94M
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
195
1.94M
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
196
1.94M
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
197
1.94M
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
198
1.94M
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
199
1.94M
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
200
201
1.94M
            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
202
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
203
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
204
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
205
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
206
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
207
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
208
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
209
210
1.94M
            ht -= 8;
211
1.94M
            pu1_src1 += src_strd1 << 3;
212
1.94M
            pu1_src2 += src_strd2 << 3;
213
1.94M
            pu1_dst += dst_strd << 3;
214
1.94M
        }
215
1.94M
        while(ht > 0);
216
991k
    }
217
1.10M
}
218
219
/*****************************************************************************/
220
/*                                                                           */
221
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
222
/*                                                                           */
223
/*  Description   : This function performs the default weighted prediction   */
224
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
225
/*                  sample prediction process" for chroma. The function gets */
226
/*                  two ht x wd blocks, calculates their rounded-average and */
227
/*                  stores it in the destination block. (ht,wd) can be       */
228
/*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
229
/*                                                                           */
230
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
231
/*                  pu1_src2  - Pointer to source 2                          */
232
/*                  pu1_dst   - Pointer to destination                       */
233
/*                  src_strd1 - stride for source 1                          */
234
/*                  src_strd2 - stride for source 2                          */
235
/*                  dst_strd  - stride for destination                       */
236
/*                  ht        - height of the block                          */
237
/*                  wd        - width of the block                           */
238
/*                                                                           */
239
/*  Issues        : None                                                     */
240
/*                                                                           */
241
/*  Revision History:                                                        */
242
/*                                                                           */
243
/*         DD MM YYYY   Author(s)       Changes                              */
244
/*         04 02 2015   Kaushik         Initial Version                      */
245
/*                      Senthoor                                             */
246
/*                                                                           */
247
/*****************************************************************************/
248
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
249
                                              UWORD8 *pu1_src2,
250
                                              UWORD8 *pu1_dst,
251
                                              WORD32 src_strd1,
252
                                              WORD32 src_strd2,
253
                                              WORD32 dst_strd,
254
                                              WORD32 ht,
255
                                              WORD32 wd)
256
1.10M
{
257
1.10M
    __m128i uv0_0_16x8b, uv0_1_16x8b;
258
1.10M
    __m128i uv1_0_16x8b, uv1_1_16x8b;
259
260
1.10M
    if(wd == 2)
261
11.1k
    {
262
11.1k
        do
263
13.8k
        {
264
13.8k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
265
13.8k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
266
267
13.8k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
268
13.8k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
269
270
13.8k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
271
13.8k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
272
273
13.8k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
274
13.8k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
275
276
13.8k
            ht -= 2;
277
13.8k
            pu1_src1 += src_strd1 << 1;
278
13.8k
            pu1_src2 += src_strd2 << 1;
279
13.8k
            pu1_dst += dst_strd << 1;
280
13.8k
        }
281
13.8k
        while(ht > 0);
282
11.1k
    }
283
1.09M
    else if(wd == 4)
284
102k
    {
285
102k
        do
286
231k
        {
287
231k
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
288
231k
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
289
290
231k
            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
291
231k
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
292
293
231k
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
294
231k
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
295
296
231k
            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
297
231k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
298
299
231k
            ht -= 2;
300
231k
            pu1_src1 += src_strd1 << 1;
301
231k
            pu1_src2 += src_strd2 << 1;
302
231k
            pu1_dst += dst_strd << 1;
303
231k
        }
304
231k
        while(ht > 0);
305
102k
    }
306
991k
    else // wd == 8
307
991k
    {
308
991k
        __m128i uv0_2_16x8b, uv0_3_16x8b;
309
991k
        __m128i uv1_2_16x8b, uv1_3_16x8b;
310
311
991k
        do
312
1.94M
        {
313
1.94M
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
314
1.94M
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
315
1.94M
            uv0_2_16x8b = _mm_loadu_si128(
316
1.94M
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
317
1.94M
            uv0_3_16x8b = _mm_loadu_si128(
318
1.94M
                            (__m128i *)(pu1_src1 + src_strd1 * 3));
319
320
1.94M
            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
321
1.94M
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
322
1.94M
            uv1_2_16x8b = _mm_loadu_si128(
323
1.94M
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
324
1.94M
            uv1_3_16x8b = _mm_loadu_si128(
325
1.94M
                            (__m128i *)(pu1_src2 + src_strd2 * 3));
326
327
1.94M
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
328
1.94M
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
329
1.94M
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
330
1.94M
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
331
332
1.94M
            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
333
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
334
1.94M
            _mm_storeu_si128(
335
1.94M
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
336
1.94M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
337
338
1.94M
            ht -= 4;
339
1.94M
            pu1_src1 += src_strd1 << 2;
340
1.94M
            pu1_src2 += src_strd2 << 2;
341
1.94M
            pu1_dst += dst_strd << 2;
342
1.94M
        }
343
1.94M
        while(ht > 0);
344
991k
    }
345
1.10M
}
346
347
/*****************************************************************************/
348
/*                                                                           */
349
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
350
/*                                                                           */
351
/*  Description   : This function performs the weighted prediction as        */
352
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
353
/*                  prediction process" for luma. The function gets one      */
354
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
355
/*                  saturates it to unsigned 8-bit and stores it in the      */
356
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
357
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
358
/*                                                                           */
359
/*  Inputs        : pu1_src  - Pointer to source                             */
360
/*                  pu1_dst  - Pointer to destination                        */
361
/*                  src_strd - stride for source                             */
362
/*                  dst_strd - stride for destination                        */
363
/*                  log_wd   - number of bits to be rounded off              */
364
/*                  wt       - weight value                                  */
365
/*                  ofst     - offset value                                  */
366
/*                  ht       - height of the block                           */
367
/*                  wd       - width of the block                            */
368
/*                                                                           */
369
/*  Issues        : None                                                     */
370
/*                                                                           */
371
/*  Revision History:                                                        */
372
/*                                                                           */
373
/*         DD MM YYYY   Author(s)       Changes                              */
374
/*         04 02 2015   Kaushik         Initial Version                      */
375
/*                      Senthoor                                             */
376
/*                                                                           */
377
/*****************************************************************************/
378
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
379
                                    UWORD8 *pu1_dst,
380
                                    WORD32 src_strd,
381
                                    WORD32 dst_strd,
382
                                    WORD32 log_wd,
383
                                    WORD32 wt,
384
                                    WORD32 ofst,
385
                                    WORD32 ht,
386
                                    WORD32 wd)
387
11.0M
{
388
11.0M
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
389
390
11.0M
    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
391
392
11.0M
    WORD32 round_val;
393
394
11.0M
    wt = (WORD16)(wt & 0xffff);
395
11.0M
    round_val = 1 << (log_wd - 1);
396
11.0M
    ofst = (WORD8)(ofst & 0xff);
397
398
11.0M
    wt_8x16b = _mm_set1_epi16(wt);
399
11.0M
    round_8x16b = _mm_set1_epi16(round_val);
400
11.0M
    ofst_8x16b = _mm_set1_epi16(ofst);
401
402
11.0M
    if(wd == 4)
403
89.6k
    {
404
89.6k
        __m128i y_0_8x16b, y_2_8x16b;
405
406
89.6k
        do
407
146k
        {
408
146k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
409
146k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
410
146k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
411
146k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
412
413
146k
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
414
146k
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
415
416
146k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
417
146k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
418
419
146k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
420
146k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
421
422
146k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
423
146k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
424
425
146k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
426
146k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
427
428
146k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
429
146k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
430
431
146k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
432
146k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
433
146k
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
434
146k
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
435
436
146k
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
437
146k
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
438
146k
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
439
146k
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
440
441
146k
            ht -= 4;
442
146k
            pu1_src += src_strd << 2;
443
146k
            pu1_dst += dst_strd << 2;
444
146k
        }
445
146k
        while(ht > 0);
446
89.6k
    }
447
10.9M
    else if(wd == 8)
448
82.8k
    {
449
82.8k
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
450
451
82.8k
        do
452
199k
        {
453
199k
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
454
199k
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
455
199k
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
456
199k
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
457
458
199k
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
459
199k
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
460
199k
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
461
199k
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
462
463
199k
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
464
199k
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
465
199k
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
466
199k
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
467
468
199k
            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
469
199k
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
470
199k
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
471
199k
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
472
473
199k
            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
474
199k
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
475
199k
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
476
199k
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
477
478
199k
            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
479
199k
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
480
199k
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
481
199k
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
482
483
199k
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
484
199k
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
485
199k
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
486
199k
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
487
488
199k
            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
489
199k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
490
199k
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
491
199k
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
492
493
199k
            ht -= 4;
494
199k
            pu1_src += src_strd << 2;
495
199k
            pu1_dst += dst_strd << 2;
496
199k
        }
497
199k
        while(ht > 0);
498
82.8k
    }
499
10.8M
    else // wd == 16
500
10.8M
    {
501
10.8M
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
502
10.8M
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
503
504
10.8M
        __m128i zero_16x8b;
505
10.8M
        zero_16x8b = _mm_set1_epi8(0);
506
507
10.8M
        do
508
43.3M
        {
509
43.3M
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
510
43.3M
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
511
43.3M
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
512
43.3M
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
513
514
43.3M
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
515
43.3M
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
516
43.3M
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
517
43.3M
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
518
43.3M
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
519
43.3M
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
520
43.3M
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
521
43.3M
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
522
523
43.3M
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
524
43.3M
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
525
43.3M
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
526
43.3M
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
527
43.3M
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
528
43.3M
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
529
43.3M
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
530
43.3M
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
531
532
43.3M
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
533
43.3M
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
534
43.3M
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
535
43.3M
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
536
43.3M
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
537
43.3M
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
538
43.3M
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
539
43.3M
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
540
541
43.3M
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
542
43.3M
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
543
43.3M
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
544
43.3M
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
545
43.3M
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
546
43.3M
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
547
43.3M
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
548
43.3M
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
549
550
43.3M
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
551
43.3M
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
552
43.3M
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
553
43.3M
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
554
43.3M
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
555
43.3M
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
556
43.3M
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
557
43.3M
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
558
559
43.3M
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
560
43.3M
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
561
43.3M
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
562
43.3M
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
563
564
43.3M
            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
565
43.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
566
43.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
567
43.3M
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
568
569
43.3M
            ht -= 4;
570
43.3M
            pu1_src += src_strd << 2;
571
43.3M
            pu1_dst += dst_strd << 2;
572
43.3M
        }
573
43.3M
        while(ht > 0);
574
10.8M
    }
575
11.0M
}
576
577
/*****************************************************************************/
578
/*                                                                           */
579
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
580
/*                                                                           */
581
/*  Description   : This function performs the weighted prediction as        */
582
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
583
/*                  prediction process" for chroma. The function gets one    */
584
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
585
/*                  saturates it to unsigned 8-bit and stores it in the      */
586
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
587
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
588
/*                                                                           */
589
/*  Inputs        : pu1_src  - Pointer to source                             */
590
/*                  pu1_dst  - Pointer to destination                        */
591
/*                  src_strd - stride for source                             */
592
/*                  dst_strd - stride for destination                        */
593
/*                  log_wd   - number of bits to be rounded off              */
594
/*                  wt       - weight values for u and v                     */
595
/*                  ofst     - offset values for u and v                     */
596
/*                  ht       - height of the block                           */
597
/*                  wd       - width of the block                            */
598
/*                                                                           */
599
/*  Issues        : None                                                     */
600
/*                                                                           */
601
/*  Revision History:                                                        */
602
/*                                                                           */
603
/*         DD MM YYYY   Author(s)       Changes                              */
604
/*         04 02 2015   Kaushik         Initial Version                      */
605
/*                      Senthoor                                             */
606
/*                                                                           */
607
/*****************************************************************************/
608
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    /* Weighted uni-prediction for interleaved (UV) chroma, as in sec
     * 8.4.2.3.2: out = ((in * wt + round) >> log_wd) + ofst, saturated to
     * [0, 255].  'wt' packs the U weight in its low 16 bits and the V
     * weight above it; 'ofst' packs the U offset in its low byte and the V
     * offset in the next byte.  'wd' counts UV *pairs* per row, so each row
     * occupies 2*wd bytes in memory.
     */
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

    /* Sign-extend the two packed 8-bit offsets, then re-pack them as a pair
     * of 16-bit lanes so one 32-bit broadcast covers a full UV unit. */
    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    /* NOTE(review): 1 << (log_wd - 1) assumes log_wd >= 1; log_wd == 0
     * would shift by -1 (UB) — presumably guaranteed by the caller, confirm. */
    round_val = 1 << (log_wd - 1);
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    wt_8x16b = _mm_set1_epi32(wt);           /* {wt_u, wt_v} in every 32-bit lane */
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);       /* {ofst_u, ofst_v} in every 32-bit lane */

    if(wd == 2)
    {
        /* 2 UV pairs = 4 bytes per row; process 2 rows per iteration by
         * packing both rows into one 8x16-bit vector. */
        __m128i y_0_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            /* Merge the two 4-byte rows into the low 8 bytes. */
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            /* Saturate to u8; second row sits 4 bytes up in the vector. */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* 4 UV pairs = 8 bytes per row; process 2 rows per iteration. */
        __m128i y_0_8x16b, y_1_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            /* Pack both rows, then split row 1 back out of the high half. */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else /* widest chroma case: 16 interleaved bytes per row.
          * NOTE(review): the original comment here read "wd == 16", but per
          * the function header chroma wd maxes out at 8 UV pairs — i.e.
          * 8 pairs * 2 bytes = 16 bytes per row; confirm against callers. */
    {
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        /* 4 rows per iteration; each 16-byte row is widened into a low (L)
         * and high (H) 8x16-bit half. */
        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}
781
782
/*****************************************************************************/
783
/*                                                                           */
784
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
785
/*                                                                           */
786
/*  Description   : This function performs the weighted biprediction as      */
787
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
788
/*                  prediction process" for luma. The function gets two      */
789
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
790
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
791
/*                  stores it in the destination block. (ht,wd) can be       */
792
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
793
/*                                                                           */
794
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
795
/*                  pu1_src2  - Pointer to source 2                          */
796
/*                  pu1_dst   - Pointer to destination                       */
797
/*                  src_strd1 - stride for source 1                          */
798
/*                  src_strd2 - stride for source 2                          */
799
/*                  dst_strd  - stride for destination                       */
800
/*                  log_wd    - number of bits to be rounded off             */
801
/*                  wt1       - weight value for source 1                    */
802
/*                  wt2       - weight value for source 2                    */
803
/*                  ofst1     - offset value for source 1                    */
804
/*                  ofst2     - offset value for source 2                    */
805
/*                  ht        - height of the block                          */
806
/*                  wd        - width of the block                           */
807
/*                                                                           */
808
/*  Issues        : None                                                     */
809
/*                                                                           */
810
/*  Revision History:                                                        */
811
/*                                                                           */
812
/*         DD MM YYYY   Author(s)       Changes                              */
813
/*         04 02 2015   Kaushik         Initial Version                      */
814
/*                      Senthoor                                             */
815
/*                                                                           */
816
/*****************************************************************************/
817
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    /* Weighted bi-prediction for luma, as in sec 8.4.2.3.2:
     *   out = ((src1*wt1 + src2*wt2 + 2^log_wd) >> (log_wd + 1))
     *         + ((ofst1 + ofst2 + 1) >> 1),
     * saturated to [0, 255].  Weights/offsets arrive packed in the low
     * bits of their WORD32 arguments and are sign-extended below.
     */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* Sign-extend the 16-bit weights and 8-bit offsets. */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;          /* 2^log_wd: bi-pred rounding term   */
    shft = log_wd + 1;                /* bi-pred shifts one bit further    */
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;  /* combined offset, rounded          */

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        /* 4-byte rows; 4 rows of each source per iteration, packed two
         * rows per 8x16-bit vector. */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* Pair up rows (0,1) and (2,3) into single vectors. */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            /* Saturating adds keep intermediate sums inside 16 bits. */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* Pack all four rows into one vector, then peel off rows 1-3. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* 8-byte rows; 4 rows of each source per iteration. */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* 16-byte rows; 2 rows of each source per iteration, each widened
         * into low (L) and high (H) 8x16-bit halves. */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
1068
1069
/*****************************************************************************/
1070
/*                                                                           */
1071
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072
/*                                                                           */
1073
/*  Description   : This function performs the weighted biprediction as      */
1074
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075
/*                  prediction process" for chroma. The function gets two    */
1076
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078
/*                  stores it in the destination block. (ht,wd) can be       */
1079
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080
/*                                                                           */
1081
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082
/*                  pu1_src2  - Pointer to source 2                          */
1083
/*                  pu1_dst   - Pointer to destination                       */
1084
/*                  src_strd1 - stride for source 1                          */
1085
/*                  src_strd2 - stride for source 2                          */
1086
/*                  dst_strd2 - stride for destination                       */
1087
/*                  log_wd    - number of bits to be rounded off             */
1088
/*                  wt1       - weight values for u and v in source 1        */
1089
/*                  wt2       - weight values for u and v in source 2        */
1090
/*                  ofst1     - offset value for u and v in source 1         */
1091
/*                  ofst2     - offset value for u and v in source 2         */
1092
/*                  ht        - height of the block                          */
1093
/*                  wd        - width of the block                           */
1094
/*                                                                           */
1095
/*  Issues        : None                                                     */
1096
/*                                                                           */
1097
/*  Revision History:                                                        */
1098
/*                                                                           */
1099
/*         DD MM YYYY   Author(s)       Changes                              */
1100
/*         04 02 2015   Kaushik         Initial Version                      */
1101
/*                      Senthoor                                             */
1102
/*                                                                           */
1103
/*****************************************************************************/
1104
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    /* Raw 8-bit interleaved-UV rows (two rows per source per iteration) */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    /* Bi-prediction: out = ((w1*p1 + w2*p2 + 2^log_wd) >> (log_wd + 1)) + ofst,
       so the rounding constant is 2^log_wd and the shift is log_wd + 1 */
    round_val = 1 << log_wd;
    shft = log_wd + 1;

    /* ofst1/ofst2 pack the U offset in the low byte and the V offset in the
       next byte; the WORD8 casts sign-extend them to signed values */
    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    /* wt1/wt2 carry the 16-bit U and V weights in one 32-bit value;
       replicating that value across the register aligns the weights with
       the interleaved UV samples for the 16-bit multiplies below */
    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    /* Combined bi-pred offset per component: (o1 + o2 + 1) >> 1,
       re-packed as {U, V} 16-bit pairs for the vector add after the shift */
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    /* Every path below consumes two rows per iteration, so ht is assumed
       even. wd appears to be measured in UV pairs: the wd == 2 path moves
       4 bytes/row, wd == 4 moves 8, wd == 8 moves 16 — TODO confirm against
       the caller's convention. */
    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            /* Load two rows (4 bytes each) from each source */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Merge the two 4-byte rows into one register, then widen the
               8 samples to 16 bits so both rows are processed at once */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            /* w1*p1 and w2*p2 (weights may be negative; products fit 16 bits) */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            /* Saturating adds guard against 16-bit overflow of the sums */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            /* Arithmetic shift preserves the sign of negative weighted sums;
               the offset is added after the shift, per the bi-pred formula */
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            /* Clip to [0, 255] via unsigned saturating pack, then split the
               two rows back apart for the 4-byte stores */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            /* Load two 8-byte rows from each source */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Widen each row's 8 samples to 16 bits */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            /* w1*p1 and w2*p2 for both rows */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            /* Sum, round, shift, offset — saturating adds throughout */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            /* Clip to [0, 255] and store each 8-byte row */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        /* 16 samples per row: split into low (L) and high (H) 8-sample
           halves so each fits a 16-bit lane register */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            /* Load two full 16-byte rows from each source */
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* Zero-extend: cvtepu8 widens the low 8 bytes, unpackhi with
               zero widens the high 8 bytes */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* w1*p1 for all four half-rows */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            /* w2*p2 for all four half-rows */
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* Sum the two weighted predictions (saturating) */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* Add rounding constant, then arithmetic shift by log_wd + 1 */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            /* Add the combined UV offsets after the shift */
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* Clip to [0, 255] and reassemble each 16-byte row */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}