Coverage Report

Created: 2025-07-12 06:37

/src/libavc/common/x86/ih264_deblk_luma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_luma_ssse3.c                             */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
27
/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
28
/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
29
/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
30
/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
31
/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a luma block         */
65
/*                  vertical edge when the boundary strength is set to 4.    */
66
/*                                                                           */
67
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
68
/*                  src_strd   - source stride                               */
69
/*                  alpha      - alpha value for the boundary                */
70
/*                  beta       - beta value for the boundary                 */
71
/*                                                                           */
72
/*  Globals       : None                                                     */
73
/*                                                                           */
74
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
75
/*                  title "Filtering process for edges for bS equal to 4" in */
76
/*                  ITU T Rec H.264.                                         */
77
/*                                                                           */
78
/*  Outputs       : pu1_src - filtered samples written back in place         */
79
/*                                                                           */
80
/*  Returns       : None                                                     */
81
/*                                                                           */
82
/*  Issues        : None                                                     */
83
/*                                                                           */
84
/*  Revision History:                                                        */
85
/*                                                                           */
86
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
87
/*         12 02 2015   Naveen Kumar P  Initial version                      */
88
/*                                                                           */
89
/*****************************************************************************/
90
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
91
                                     WORD32 src_strd,
92
                                     WORD32 alpha,
93
                                     WORD32 beta)
94
8.40M
{
    // Filters 16 rows in place. For every row k (0..15), 8 bytes starting at
    // (pu1_src - 4 + k * src_strd) are loaded; they are referred to below as
    // p3 p2 p1 p0 q0 q1 q2 q3 (q0 is the byte at pu1_src). The rows are
    // transposed so that each sample position occupies one register, the
    // filter decisions and filtered values are computed for all 16 rows at
    // once, and the result is transposed back and stored to the same addresses.
    //
    // Naming: "_16x8" registers generally hold 16 bytes (one per row) and
    // "_8x16" registers hold 8 16-bit lanes. NOTE(review): several of these
    // names are reused as scratch registers with a different layout (e.g.
    // during the transposes, and p0_16x8_1 etc. before they are packed).
95
8.40M
    __m128i zero = _mm_setzero_si128();
96
8.40M
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
97
8.40M
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
98
8.40M
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
99
8.40M
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
100
8.40M
    __m128i q0_16x8_1;
101
8.40M
    __m128i p0_16x8_1;
102
8.40M
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
103
8.40M
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
104
8.40M
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
105
8.40M
    __m128i Alpha_8x16, Beta_8x16;
106
8.40M
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
107
8.40M
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
108
8.40M
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
109
    // Broadcast the thresholds to all eight 16-bit lanes.
110
8.40M
    Alpha_8x16 = _mm_set1_epi16(alpha);
111
8.40M
    Beta_8x16 = _mm_set1_epi16(beta);
112
    // Rows 0..7: load 8 bytes per row into the low half of each register.
113
8.40M
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
114
8.40M
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
115
8.40M
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
116
8.40M
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
117
8.40M
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
118
8.40M
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
119
8.40M
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
120
8.40M
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
121
    // Transpose the 8x8 byte block of rows 0..7 with three interleave stages
    // (8-bit, 16-bit, 32-bit).
122
8.40M
    temp1 = _mm_unpacklo_epi8(line1, line2);
123
8.40M
    temp2 = _mm_unpacklo_epi8(line3, line4);
124
8.40M
    temp3 = _mm_unpacklo_epi8(line5, line6);
125
8.40M
    temp4 = _mm_unpacklo_epi8(line7, line8);
126
127
8.40M
    line1 = _mm_unpacklo_epi16(temp1, temp2);
128
8.40M
    line2 = _mm_unpackhi_epi16(temp1, temp2);
129
8.40M
    line3 = _mm_unpacklo_epi16(temp3, temp4);
130
8.40M
    line4 = _mm_unpackhi_epi16(temp3, temp4);
131
    // The *_8x16 names are used as scratch here; for rows 0..7 they hold two
    // sample columns each: p1_8x16 = {p3, p2}, p0_8x16 = {p1, p0},
    // q0_8x16 = {q0, q1}, q1_8x16 = {q2, q3}.
132
8.40M
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
133
8.40M
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
134
8.40M
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
135
8.40M
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
136
    // Rows 8..15: same load and transpose as above.
137
8.40M
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
138
8.40M
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
139
8.40M
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
140
8.40M
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
141
8.40M
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
142
8.40M
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
143
8.40M
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
144
8.40M
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
145
146
8.40M
    temp1 = _mm_unpacklo_epi8(line1, line2);
147
8.40M
    temp2 = _mm_unpacklo_epi8(line3, line4);
148
8.40M
    temp3 = _mm_unpacklo_epi8(line5, line6);
149
8.40M
    temp4 = _mm_unpacklo_epi8(line7, line8);
150
151
8.40M
    line1 = _mm_unpacklo_epi16(temp1, temp2);
152
8.40M
    line2 = _mm_unpackhi_epi16(temp1, temp2);
153
8.40M
    line3 = _mm_unpacklo_epi16(temp3, temp4);
154
8.40M
    line4 = _mm_unpackhi_epi16(temp3, temp4);
155
    // For rows 8..15: temp1 = {p3, p2}, temp2 = {p1, p0}, temp3 = {q0, q1},
    // temp4 = {q2, q3}.
156
8.40M
    temp1 = _mm_unpacklo_epi32(line1, line3);
157
8.40M
    temp2 = _mm_unpackhi_epi32(line1, line3);
158
8.40M
    temp3 = _mm_unpacklo_epi32(line2, line4);
159
8.40M
    temp4 = _mm_unpackhi_epi32(line2, line4);
160
    // Join rows 0..7 (low 8 bytes) with rows 8..15 (high 8 bytes): each
    // register below holds one sample position for all 16 rows.
161
8.40M
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
162
8.40M
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
163
8.40M
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
164
8.40M
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
165
8.40M
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
166
8.40M
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
167
8.40M
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
168
8.40M
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
169
170
    //Cond1 (ABS(p0 - q0) < alpha)
    // The absolute difference is formed from two saturating unsigned
    // subtractions (one of them is always zero). It is then zero-extended to
    // 16 bits, compared against the threshold with a signed 16-bit compare,
    // and the 0x0000/0xFFFF results are packed back to 0x00/0xFF byte masks.
    // The same pattern is used for every condition below.
171
8.40M
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
172
8.40M
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
173
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
174
175
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
176
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
177
178
8.40M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
179
8.40M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
180
181
8.40M
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
182
183
    //Cond2 (ABS(q1 - q0) < beta)
184
8.40M
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
185
8.40M
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
186
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
187
188
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
189
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
190
191
8.40M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
192
8.40M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
193
194
8.40M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
195
196
8.40M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
197
198
    //Cond3 (ABS(p1 - p0) < beta)
199
8.40M
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
200
8.40M
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
201
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
202
203
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
204
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
205
206
8.40M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
207
8.40M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
208
209
8.40M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
210
211
    // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
212
8.40M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
213
214
    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
    // Alpha_8x16 is overwritten with (alpha >> 2) + 2 from here on.
215
8.40M
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
216
8.40M
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
217
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
218
8.40M
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
219
8.40M
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
220
221
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
222
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
223
8.40M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
224
8.40M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
225
    // flag2 = flag1 && (ABS(p0 - q0) < ((alpha >> 2) + 2))
226
8.40M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
227
8.40M
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
228
229
    // (ABS(p2 - p0) < beta)
230
8.40M
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
231
8.40M
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
232
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
233
234
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
235
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
236
8.40M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
237
8.40M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
238
    // flag3 = flag2 && (ABS(p2 - p0) < beta): selects the "_2" results on the p side
239
8.40M
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
240
8.40M
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
241
242
    // (ABS(q2 - q0) < beta)
243
8.40M
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
244
8.40M
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
245
8.40M
    temp1 = _mm_add_epi8(temp1, temp2);
246
247
8.40M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
248
8.40M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
249
8.40M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
250
8.40M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
251
    // flag4 = flag2 && (ABS(q2 - q0) < beta): selects the "_2" results on the q side
252
8.40M
    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
253
8.40M
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
254
255
    // First 8 pixels
    // Zero-extend rows 0..7 of every sample position to 16 bits so the sums
    // below cannot overflow.
256
8.40M
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
257
8.40M
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
258
8.40M
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
259
8.40M
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
260
8.40M
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
261
8.40M
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
262
8.40M
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
263
8.40M
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
264
265
    // p0_1 and q0_1
    //   p0_1 = (2 * p1 + p0 + q1 + 2) >> 2
    //   q0_1 = (2 * q1 + q0 + p1 + 2) >> 2
    // temp5 = p0 + q1 + 2 and temp6 = p1 + q0 + 2 are kept for the next steps.
266
8.40M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
267
8.40M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
268
8.40M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
269
8.40M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
270
8.40M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
271
8.40M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
272
8.40M
    temp1 = _mm_add_epi16(temp5, temp3);
273
8.40M
    temp2 = _mm_add_epi16(temp6, temp4);
274
8.40M
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
275
8.40M
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
276
277
    // p1_2 and q1_2
    //   p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
    //   q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
    // After this step temp6 = p1 + p0 + q0 + 2 and temp5 = q1 + q0 + p0 + 2.
278
8.40M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
279
8.40M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
280
8.40M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
281
8.40M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
282
8.40M
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
283
8.40M
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
284
285
    // p0_2 and q0_2
    //   p0_2 = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
    //   q0_2 = (q2 + 2 * q1 + 2 * q0 + 2 * p0 + p1 + 4) >> 3
    // temp3 / temp4 still hold 2 * p1 / 2 * q1 on entry.
286
8.40M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
287
8.40M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
288
8.40M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
289
8.40M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
290
8.40M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
291
8.40M
    temp3 = _mm_slli_epi16(temp3, 1);
292
8.40M
    temp1 = _mm_add_epi16(temp1, temp3);
293
8.40M
    temp2 = _mm_add_epi16(temp2, temp3);
294
8.40M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
295
8.40M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
296
8.40M
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
297
8.40M
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
298
299
    // p2_2 and q2_2
    //   p2_2 = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
    //   q2_2 = (2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3
    // temp6 / temp5 already include a rounding term of 2; 2 more is added here.
300
8.40M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
301
8.40M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
302
8.40M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
303
8.40M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
304
8.40M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
305
8.40M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
306
8.40M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
307
8.40M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
308
8.40M
    temp1 = _mm_add_epi16(temp1, temp3);
309
8.40M
    temp2 = _mm_add_epi16(temp2, temp4);
310
8.40M
    temp1 = _mm_add_epi16(temp1, temp5);
311
8.40M
    temp2 = _mm_add_epi16(temp2, temp6);
312
8.40M
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
313
8.40M
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
314
315
    // Second 8 pixels and packing with first 8 pixels
    // Rows 8..15 go through the same arithmetic as rows 0..7; each result is
    // then packed (unsigned saturation) together with the rows 0..7 result
    // into a single 16-byte register.
316
8.40M
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
317
8.40M
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
318
8.40M
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
319
8.40M
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
320
8.40M
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
321
8.40M
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
322
8.40M
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
323
8.40M
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
324
325
    // p0_1 and q0_1
326
8.40M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
327
8.40M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
328
8.40M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
329
8.40M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
330
8.40M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
331
8.40M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
332
8.40M
    temp1 = _mm_add_epi16(temp5, temp3);
333
8.40M
    temp2 = _mm_add_epi16(temp6, temp4);
334
8.40M
    temp1 = _mm_srai_epi16(temp1, 2);
335
8.40M
    temp2 = _mm_srai_epi16(temp2, 2);
336
8.40M
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
337
8.40M
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
338
339
    // p1_2 and q1_2
340
8.40M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
341
8.40M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
342
8.40M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
343
8.40M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
344
8.40M
    temp1 = _mm_srai_epi16(temp1, 2);
345
8.40M
    temp2 = _mm_srai_epi16(temp2, 2);
346
8.40M
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
347
8.40M
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
348
349
    // p0_2 and q0_2
350
8.40M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
351
8.40M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
352
8.40M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
353
8.40M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
354
8.40M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
355
8.40M
    temp3 = _mm_slli_epi16(temp3, 1);
356
8.40M
    temp1 = _mm_add_epi16(temp1, temp3);
357
8.40M
    temp2 = _mm_add_epi16(temp2, temp3);
358
8.40M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
359
8.40M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
360
8.40M
    temp1 = _mm_srai_epi16(temp1, 3);
361
8.40M
    temp2 = _mm_srai_epi16(temp2, 3);
362
8.40M
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
363
8.40M
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
364
365
    // p2_2 and q2_2
366
8.40M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
367
8.40M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
368
8.40M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
369
8.40M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
370
8.40M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
371
8.40M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
372
8.40M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
373
8.40M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
374
8.40M
    temp1 = _mm_add_epi16(temp1, temp3);
375
8.40M
    temp2 = _mm_add_epi16(temp2, temp4);
376
8.40M
    temp1 = _mm_add_epi16(temp1, temp5);
377
8.40M
    temp2 = _mm_add_epi16(temp2, temp6);
378
8.40M
    temp1 = _mm_srai_epi16(temp1, 3);
379
8.40M
    temp2 = _mm_srai_epi16(temp2, 3);
380
8.40M
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
381
8.40M
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
382
383
    // p0 and q0: where flag1 is set, replace p0 / q0 with p0_1 / q0_1.
    // Each select is (old & ~flag) + (new & flag); the two operands never have
    // bits set in the same byte, so the byte-wise add simply merges them.
384
8.40M
    p0_16x8 = _mm_and_si128(p0_16x8,
385
8.40M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
386
8.40M
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
387
8.40M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
388
8.40M
    q0_16x8 = _mm_and_si128(q0_16x8,
389
8.40M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
390
8.40M
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
391
8.40M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
392
393
    // p0 and q0 again: where flag3 (p side) / flag4 (q side) is set, override
    // the value selected above with p0_2 / q0_2.
394
8.40M
    p0_16x8 = _mm_and_si128(p0_16x8,
395
8.40M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
396
8.40M
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
397
8.40M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
398
8.40M
    q0_16x8 = _mm_and_si128(q0_16x8,
399
8.40M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
400
8.40M
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
401
8.40M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
402
403
    // p1 and q1: replaced by p1_2 / q1_2 only where flag3 / flag4 is set.
404
8.40M
    p1_16x8 = _mm_and_si128(p1_16x8,
405
8.40M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
406
8.40M
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
407
8.40M
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
408
8.40M
    q1_16x8 = _mm_and_si128(q1_16x8,
409
8.40M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
410
8.40M
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
411
8.40M
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
412
413
    // p2 and q2: replaced by p2_2 / q2_2 only where flag3 / flag4 is set.
414
8.40M
    p2_16x8 = _mm_and_si128(p2_16x8,
415
8.40M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
416
8.40M
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
417
8.40M
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
418
8.40M
    q2_16x8 = _mm_and_si128(q2_16x8,
419
8.40M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
420
8.40M
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
421
8.40M
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
422
    // Transpose rows 0..7 back to row order. p3 and q3 were never modified and
    // are written back unchanged. The *_8x16 names are scratch again here.
423
8.40M
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
424
8.40M
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
425
8.40M
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
426
8.40M
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
427
428
8.40M
    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
429
8.40M
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
430
8.40M
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
431
8.40M
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
432
    // Each unpack result holds two complete 8-byte rows; the odd row is moved
    // to the low half with an 8-byte right shift so it can be stored with
    // _mm_storel_epi64.
433
8.40M
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
434
8.40M
    line2 = _mm_srli_si128(line1, 8);
435
8.40M
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
436
8.40M
    line4 = _mm_srli_si128(line3, 8);
437
8.40M
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
438
8.40M
    line6 = _mm_srli_si128(line5, 8);
439
8.40M
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
440
8.40M
    line8 = _mm_srli_si128(line7, 8);
441
    // Store rows 0..7 back to the addresses they were loaded from.
442
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
443
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
444
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
445
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
446
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
447
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
448
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
449
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
450
    // Rows 8..15: same transpose and store, using the high halves.
451
8.40M
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
452
8.40M
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
453
8.40M
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
454
8.40M
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
455
456
8.40M
    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
457
8.40M
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
458
8.40M
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
459
8.40M
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
460
461
8.40M
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
462
8.40M
    line2 = _mm_srli_si128(line1, 8);
463
8.40M
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
464
8.40M
    line4 = _mm_srli_si128(line3, 8);
465
8.40M
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
466
8.40M
    line6 = _mm_srli_si128(line5, 8);
467
8.40M
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
468
8.40M
    line8 = _mm_srli_si128(line7, 8);
469
470
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
471
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
472
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
473
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
474
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
475
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
476
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
477
8.40M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
478
479
8.40M
}
480
481
/*****************************************************************************/
482
/*                                                                           */
483
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
484
/*                                                                           */
485
/*  Description   : This function performs filtering of a luma block         */
486
/*                  horizontal edge when the boundary strength is set to 4.  */
487
/*                                                                           */
488
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
489
/*                  src_strd   - source stride                               */
490
/*                  alpha      - alpha value for the boundary                */
491
/*                  beta       - beta value for the boundary                 */
492
/*                                                                           */
493
/*  Globals       : None                                                     */
494
/*                                                                           */
495
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
496
/*                  title "Filtering process for edges for bS equal to 4" in */
497
/*                  ITU T Rec H.264.                                         */
498
/*                                                                           */
499
/*  Outputs       : None                                                     */
500
/*                                                                           */
501
/*  Returns       : None                                                     */
502
/*                                                                           */
503
/*  Issues        : None                                                     */
504
/*                                                                           */
505
/*  Revision History:                                                        */
506
/*                                                                           */
507
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
508
/*         12 02 2015   Naveen Kumar P  Initial version                      */
509
/*                                                                           */
510
/*****************************************************************************/
511
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
512
                                     WORD32 src_strd,
513
                                     WORD32 alpha,
514
                                     WORD32 beta)
515
8.38M
{
    // Filters 16 horizontally adjacent pixels across a horizontal edge.
    // Reads 16 bytes from each of the four rows above pu1_src (p3..p0) and
    // the four rows starting at pu1_src (q0..q3); writes back rows p2, p1,
    // p0, q0, q1 and q2. Rows p3 and q3 are only read.
516
8.38M
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
517
8.38M
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
518
8.38M
    UWORD8 *pu1_HorzPixel;
519
8.38M
    __m128i zero = _mm_setzero_si128();
520
8.38M
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
521
8.38M
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
522
8.38M
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
523
8.38M
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
524
8.38M
    __m128i q0_16x8_1;
525
8.38M
    __m128i p0_16x8_1;
526
8.38M
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
527
8.38M
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
528
8.38M
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
529
8.38M
    __m128i Alpha_8x16, Beta_8x16;
530
8.38M
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
531
8.38M
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
532
533
8.38M
    // Base pointer for the P side: four rows above pu1_src (row p3).
    pu1_HorzPixel = pu1_src - (src_strd << 2);
534
535
8.38M
    // Q offsets are relative to pu1_src, P offsets are relative to
    // pu1_HorzPixel. X2()/X3() are defined elsewhere; presumably 2x and 3x
    // the argument -- confirm.
    // NOTE(review): the offsets are stored in WORD16 (presumably a 16-bit
    // type) while src_strd is WORD32; confirm callers never pass a stride
    // large enough for X3(src_strd) to be truncated.
    i16_posQ1 = src_strd;
536
8.38M
    i16_posQ2 = X2(src_strd);
537
8.38M
    i16_posQ3 = X3(src_strd);
538
8.38M
    i16_posP0 = X3(src_strd);
539
8.38M
    i16_posP1 = X2(src_strd);
540
8.38M
    i16_posP2 = src_strd;
541
8.38M
    i16_posP3 = 0;
542
543
8.38M
    // Thresholds broadcast to 16-bit lanes so they can be compared against
    // the zero-extended 8-bit absolute differences below.
    Alpha_8x16 = _mm_set1_epi16(alpha);
544
8.38M
    Beta_8x16 = _mm_set1_epi16(beta);
545
546
8.38M
    // Unaligned 16-byte loads of the eight rows around the edge.
    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
547
8.38M
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
548
8.38M
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
549
8.38M
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
550
8.38M
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
551
8.38M
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
552
8.38M
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
553
8.38M
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
554
555
    //Cond1 (ABS(p0 - q0) < alpha)
    // Absolute difference idiom used throughout: the two saturating unsigned
    // subtractions yield (a-b) and (b-a) with the negative one clamped to 0,
    // so their sum is |a-b|.
556
8.38M
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
557
8.38M
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
558
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
559
560
8.38M
    // Zero-extend to 16 bits (low half in temp2, high half in temp1) for the
    // signed 16-bit compare.
    temp2 = _mm_unpacklo_epi8(temp1, zero);
561
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
562
563
8.38M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
564
8.38M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
565
566
8.38M
    // Narrow the 0x0000/0xFFFF compare results back to a 0x00/0xFF byte mask.
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
567
568
    //Cond2 (ABS(q1 - q0) < beta)
569
8.38M
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
570
8.38M
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
571
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
572
573
8.38M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
574
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
575
576
8.38M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
577
8.38M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
578
579
8.38M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
580
581
8.38M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
582
583
    //Cond3 (ABS(p1 - p0) < beta)
584
8.38M
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
585
8.38M
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
586
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
587
588
8.38M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
589
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
590
591
8.38M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
592
8.38M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
593
594
8.38M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
595
596
    // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
    // i.e. the per-pixel mask of positions that get filtered at all.
597
8.38M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
598
599
    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
    // Alpha_8x16 is overwritten here with (alpha >> 2) + 2.
600
8.38M
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
601
8.38M
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
602
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
603
8.38M
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
604
8.38M
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
605
606
8.38M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
607
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
608
8.38M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
609
8.38M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
610
611
8.38M
    // flag2 = flag1 && (ABS(p0 - q0) < ((alpha >> 2) + 2))
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
612
8.38M
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
613
614
    // (ABS(p2 - p0) < beta)
615
8.38M
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
616
8.38M
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
617
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
618
619
8.38M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
620
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
621
8.38M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
622
8.38M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
623
624
8.38M
    // flag3 = flag2 && (ABS(p2 - p0) < beta): selects the stronger P-side filter.
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
625
8.38M
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
626
627
    // (ABS(q2 - q0) < beta)
628
8.38M
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
629
8.38M
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
630
8.38M
    temp1 = _mm_add_epi8(temp1, temp2);
631
632
8.38M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
633
8.38M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
634
8.38M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
635
8.38M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
636
637
8.38M
    // flag4 = flag2 && (ABS(q2 - q0) < beta): selects the stronger Q-side filter.
    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
638
8.38M
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
639
640
    // First 8 pixels
    // Zero-extend the low 8 bytes of every row to 16 bits so the filter sums
    // cannot overflow.
641
8.38M
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
642
8.38M
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
643
8.38M
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
644
8.38M
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
645
8.38M
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
646
8.38M
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
647
8.38M
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
648
8.38M
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
649
650
    // p0_1 and q0_1
    // p0_1 = (2*p1 + p0 + q1 + 2) >> 2
    // q0_1 = (2*q1 + q0 + p1 + 2) >> 2
    // temp5 (= p0 + q1 + 2) and temp6 (= p1 + q0 + 2) are reused below.
651
8.38M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
652
8.38M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
653
8.38M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
654
8.38M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
655
8.38M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
656
8.38M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
657
8.38M
    temp1 = _mm_add_epi16(temp5, temp3);
658
8.38M
    temp2 = _mm_add_epi16(temp6, temp4);
659
8.38M
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
660
8.38M
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
661
662
    // p1_2 and q1_2
    // p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
    // q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
    // After this step temp6 = p1 + p0 + q0 + 2 and temp5 = q1 + q0 + p0 + 2.
663
8.38M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
664
8.38M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
665
8.38M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
666
8.38M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
667
8.38M
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
668
8.38M
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
669
670
    // p0_2 and q0_2
    // p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    // q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
671
8.38M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
672
8.38M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
673
8.38M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
674
8.38M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
675
8.38M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
676
8.38M
    temp3 = _mm_slli_epi16(temp3, 1);
677
8.38M
    temp1 = _mm_add_epi16(temp1, temp3);
678
8.38M
    temp2 = _mm_add_epi16(temp2, temp3);
679
8.38M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
680
8.38M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
681
8.38M
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
682
8.38M
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
683
684
    // p2_2 and q2_2
    // p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    // q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    // (temp6/temp5 already contain a +2 rounding term; adding 2 again gives +4.)
685
8.38M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
686
8.38M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
687
8.38M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
688
8.38M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
689
8.38M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
690
8.38M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
691
8.38M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
692
8.38M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
693
8.38M
    temp1 = _mm_add_epi16(temp1, temp3);
694
8.38M
    temp2 = _mm_add_epi16(temp2, temp4);
695
8.38M
    temp1 = _mm_add_epi16(temp1, temp5);
696
8.38M
    temp2 = _mm_add_epi16(temp2, temp6);
697
8.38M
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
698
8.38M
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
699
700
    // Second 8 pixels and packing with first 8 pixels
    // Same computations on the high 8 bytes of every row; each result is
    // packed (unsigned saturation) behind the low-half result computed above.
701
8.38M
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
702
8.38M
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
703
8.38M
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
704
8.38M
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
705
8.38M
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
706
8.38M
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
707
8.38M
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
708
8.38M
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
709
710
    // p0_1 and q0_1
711
8.38M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
712
8.38M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
713
8.38M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
714
8.38M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
715
8.38M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
716
8.38M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
717
8.38M
    temp1 = _mm_add_epi16(temp5, temp3);
718
8.38M
    temp2 = _mm_add_epi16(temp6, temp4);
719
8.38M
    temp1 = _mm_srai_epi16(temp1, 2);
720
8.38M
    temp2 = _mm_srai_epi16(temp2, 2);
721
8.38M
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
722
8.38M
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
723
724
    // p1_2 and q1_2
725
8.38M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
726
8.38M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
727
8.38M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
728
8.38M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
729
8.38M
    temp1 = _mm_srai_epi16(temp1, 2);
730
8.38M
    temp2 = _mm_srai_epi16(temp2, 2);
731
8.38M
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
732
8.38M
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
733
734
    // p0_2 and q0_2
735
8.38M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
736
8.38M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
737
8.38M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
738
8.38M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
739
8.38M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
740
8.38M
    temp3 = _mm_slli_epi16(temp3, 1);
741
8.38M
    temp1 = _mm_add_epi16(temp1, temp3);
742
8.38M
    temp2 = _mm_add_epi16(temp2, temp3);
743
8.38M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
744
8.38M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
745
8.38M
    temp1 = _mm_srai_epi16(temp1, 3);
746
8.38M
    temp2 = _mm_srai_epi16(temp2, 3);
747
8.38M
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
748
8.38M
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
749
750
    // p2_2 and q2_2
751
8.38M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
752
8.38M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
753
8.38M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
754
8.38M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
755
8.38M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
756
8.38M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
757
8.38M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
758
8.38M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
759
8.38M
    temp1 = _mm_add_epi16(temp1, temp3);
760
8.38M
    temp2 = _mm_add_epi16(temp2, temp4);
761
8.38M
    temp1 = _mm_add_epi16(temp1, temp5);
762
8.38M
    temp2 = _mm_add_epi16(temp2, temp6);
763
8.38M
    temp1 = _mm_srai_epi16(temp1, 3);
764
8.38M
    temp2 = _mm_srai_epi16(temp2, 3);
765
8.38M
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
766
8.38M
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
767
768
    // p0 and q0, stage 1: where flag1 is set take the p0_1/q0_1 values.
    // Blend idiom used below: (old & ~mask) + (new & mask). Each mask byte is
    // 0x00 or 0xFF, so the two terms never overlap and the add acts as a
    // per-byte select.
769
8.38M
    p0_16x8 = _mm_and_si128(p0_16x8,
770
8.38M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
771
8.38M
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
772
8.38M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
773
8.38M
    q0_16x8 = _mm_and_si128(q0_16x8,
774
8.38M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
775
8.38M
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
776
8.38M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
777
778
    // p0 and q0, stage 2: where flag3 (P side) / flag4 (Q side) is set,
    // override the stage-1 result with the p0_2/q0_2 values.
779
8.38M
    p0_16x8 = _mm_and_si128(p0_16x8,
780
8.38M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
781
8.38M
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
782
8.38M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
783
8.38M
    q0_16x8 = _mm_and_si128(q0_16x8,
784
8.38M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
785
8.38M
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
786
8.38M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
787
788
    // p1 and q1: replaced only where flag3 / flag4 is set.
789
8.38M
    p1_16x8 = _mm_and_si128(p1_16x8,
790
8.38M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
791
8.38M
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
792
8.38M
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
793
8.38M
    q1_16x8 = _mm_and_si128(q1_16x8,
794
8.38M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
795
8.38M
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
796
8.38M
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
797
798
    // p2 and q2: replaced only where flag3 / flag4 is set.
799
8.38M
    p2_16x8 = _mm_and_si128(p2_16x8,
800
8.38M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
801
8.38M
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
802
8.38M
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
803
8.38M
    q2_16x8 = _mm_and_si128(q2_16x8,
804
8.38M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
805
8.38M
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
806
8.38M
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
807
808
8.38M
    // Write back the six rows that can change (p2..p0, q0..q2); all 16 bytes
    // of each row are stored, unfiltered positions keep their loaded value.
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
809
8.38M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
810
8.38M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
811
812
8.38M
    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
813
8.38M
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
814
8.38M
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
815
816
8.38M
}
817
818
/*****************************************************************************/
819
/*                                                                           */
820
/*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
821
/*                                                                           */
822
/*  Description   : This function performs filtering of a luma block         */
823
/*                  vertical edge when the boundary strength is less than 4. */
824
/*                                                                           */
825
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
826
/*                  src_strd      - source stride                            */
827
/*                  alpha         - alpha value for the boundary             */
828
/*                  beta          - beta value for the boundary              */
829
/*                  u4_bs         - packed Boundary strength array           */
830
/*                  pu1_cliptab   - tc0_table                                */
831
/*                                                                           */
832
/*  Globals       : None                                                     */
833
/*                                                                           */
834
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
835
/*                  title "Filtering process for edges for bS less than 4"   */
836
/*                  in ITU T Rec H.264.                                      */
837
/*                                                                           */
838
/*  Outputs       : None                                                     */
839
/*                                                                           */
840
/*  Returns       : None                                                     */
841
/*                                                                           */
842
/*  Issues        : None                                                     */
843
/*                                                                           */
844
/*  Revision History:                                                        */
845
/*                                                                           */
846
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
847
/*         12 02 2015   Naveen Kumar P  Initial version                      */
848
/*                                                                           */
849
/*****************************************************************************/
850
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
851
                                       WORD32 src_strd,
852
                                       WORD32 alpha,
853
                                       WORD32 beta,
854
                                       UWORD32 u4_bs,
855
                                       const UWORD8 *pu1_cliptab)
856
29.8M
{
    // Filters a vertical edge in passes of 8 rows. Each pass reads 8 bytes
    // per row starting at pu1_src - 3 (samples p2 p1 p0 | q0 q1 q2 q3 and one
    // more), transposes them so every sample position becomes one vector of
    // eight 16-bit lanes, filters, transposes back and stores 8 bytes per row.
    // The two most significant bytes of u4_bs are consumed per pass and index
    // pu1_cliptab to obtain the clipping value C0.
857
29.8M
    UWORD8 u1_Bs, u1_Bs1;
858
859
29.8M
    WORD32 j = 0;
860
861
29.8M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
862
29.8M
    __m128i int1, int2, int3, int4, high1, high2;
863
29.8M
    __m128i flag, flag1, i_C, i_C0;
864
29.8M
    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
865
29.8M
                    temp1;
866
29.8M
    __m128i zero = _mm_setzero_si128();
867
868
89.4M
    // j is the byte offset of the current group of 8 rows: 0, then
    // 8 * src_strd (two passes for a positive stride).
    // NOTE(review): termination relies on src_strd > 0 -- with src_strd == 0
    // j never advances, and with a negative stride the body never runs.
    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
869
59.6M
    {
870
        //Transpose
        // Rows 0..3 of this pass: load 8 bytes each and zero-extend to 16 bits.
871
59.6M
        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
872
59.6M
        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
873
59.6M
        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
874
59.6M
        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
875
876
59.6M
        linea = _mm_unpacklo_epi8(linea, zero);
877
59.6M
        lineb = _mm_unpacklo_epi8(lineb, zero);
878
59.6M
        linec = _mm_unpacklo_epi8(linec, zero);
879
59.6M
        lined = _mm_unpacklo_epi8(lined, zero);
880
881
59.6M
        int1 = _mm_unpacklo_epi16(linea, lineb);
882
59.6M
        lineb = _mm_unpackhi_epi16(linea, lineb);
883
884
59.6M
        int2 = _mm_unpacklo_epi16(linec, lined);
885
59.6M
        lined = _mm_unpackhi_epi16(linec, lined);
886
887
59.6M
        linea = _mm_unpacklo_epi16(int1, int2);
888
59.6M
        int1 = _mm_unpackhi_epi16(int1, int2);
889
890
59.6M
        linec = _mm_unpacklo_epi16(lineb, lined);
891
59.6M
        high1 = _mm_unpackhi_epi16(lineb, lined);
892
893
59.6M
        // Rows 4..7 of this pass, same treatment.
        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
894
59.6M
        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
895
59.6M
        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
896
59.6M
        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
897
898
59.6M
        linee = _mm_unpacklo_epi8(linee, zero);
899
59.6M
        linef = _mm_unpacklo_epi8(linef, zero);
900
59.6M
        lineg = _mm_unpacklo_epi8(lineg, zero);
901
59.6M
        lineh = _mm_unpacklo_epi8(lineh, zero);
902
903
59.6M
        int2 = _mm_unpacklo_epi16(linee, linef);
904
59.6M
        linef = _mm_unpackhi_epi16(linee, linef);
905
906
59.6M
        int3 = _mm_unpacklo_epi16(lineg, lineh);
907
59.6M
        lineh = _mm_unpackhi_epi16(lineg, lineh);
908
909
59.6M
        linee = _mm_unpacklo_epi16(int2, int3);
910
59.6M
        int2 = _mm_unpackhi_epi16(int2, int3);
911
912
59.6M
        lineg = _mm_unpacklo_epi16(linef, lineh);
913
59.6M
        high2 = _mm_unpackhi_epi16(linef, lineh);
914
915
59.6M
        int4 = _mm_unpacklo_epi16(linea, linee);
916
59.6M
        lineb = _mm_unpackhi_epi16(linea, linee);
917
918
59.6M
        int3 = _mm_unpacklo_epi16(int1, int2);
919
59.6M
        lined = _mm_unpackhi_epi16(int1, int2);
920
921
59.6M
        int2 = _mm_unpacklo_epi16(linec, lineg);
922
59.6M
        linef = _mm_unpackhi_epi16(linec, lineg);
923
924
59.6M
        linea = int4;
925
59.6M
        linec = int3;
926
59.6M
        linee = int2;
927
928
59.6M
        lineg = _mm_unpacklo_epi16(high1, high2);
929
59.6M
        lineh = _mm_unpackhi_epi16(high1, high2);
930
931
        //end of transpose
        // Now each vector holds one sample position for all 8 rows:
        //   linea = p2, lineb = p1, linec = p0, lined = q0,
        //   linee = q1, linef = q2, lineg = q3, lineh = sample at pu1_src + 4.
        // Because only 16-bit unpacks are used, the lane order is rows
        // 0,4,2,6,1,5,3,7: even lanes carry rows 0..3, odd lanes rows 4..7.
932
933
59.6M
        // bS for rows 0..3 (u1_Bs) and rows 4..7 (u1_Bs1) of this pass; the
        // shift exposes the next two bS bytes for the following pass.
        u1_Bs = (u4_bs >> 24) & 0xff;
934
59.6M
        u1_Bs1 = (u4_bs >> 16) & 0xff;
935
59.6M
        u4_bs <<= 16;
936
937
59.6M
        // Alternating Bs/Bs1 matches the even/odd lane layout described above
        // (_mm_set_epi16 takes the highest lane first).
        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
938
59.6M
                              u1_Bs1, u1_Bs);
939
59.6M
        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
940
59.6M
        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask
        // flag1 is now all-ones in lanes whose bS is non-zero.
941
942
59.6M
        // C0 per lane, looked up in pu1_cliptab by the lane's bS value.
        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
943
59.6M
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
944
59.6M
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
945
59.6M
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
946
947
59.6M
        diff = _mm_subs_epi16(linec, lined); //Condn 1
        // ABS(p0 - q0) < alpha
948
59.6M
        diff = _mm_abs_epi16(diff);
949
59.6M
        const1 = _mm_set1_epi16(alpha);
950
59.6M
        flag = _mm_cmpgt_epi16(const1, diff);
951
952
59.6M
        diff = _mm_subs_epi16(linee, lined); //Condtn 2
        // ABS(q1 - q0) < beta
953
59.6M
        diff = _mm_abs_epi16(diff);
954
59.6M
        const1 = _mm_set1_epi16(beta);
955
59.6M
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
956
957
59.6M
        diff = _mm_subs_epi16(lineb, linec); //Condtn 3
        // ABS(p1 - p0) < beta
958
59.6M
        diff = _mm_abs_epi16(diff);
959
59.6M
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on
960
961
59.6M
        flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)
962
963
        //Adding Ap<Beta and Aq<Beta
        // Ap = ABS(p2 - p0), Aq = ABS(q2 - q0).
        // C = C0 + (Ap < beta ? 1 : 0) + (Aq < beta ? 1 : 0); the compare
        // yields 0xFFFF (-1) or 0, and 0 - (-1) turns that into 1.
964
59.6M
        i_Ap = _mm_subs_epi16(linea, linec);
965
59.6M
        i_Ap = _mm_abs_epi16(i_Ap);
966
59.6M
        const2 = _mm_cmpgt_epi16(const1, i_Ap);
967
59.6M
        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
968
59.6M
        i_C = _mm_add_epi16(i_C0, const2);
969
970
59.6M
        i_Aq = _mm_subs_epi16(linef, lined);
971
59.6M
        i_Aq = _mm_abs_epi16(i_Aq);
972
59.6M
        const2 = _mm_cmpgt_epi16(const1, i_Aq);
973
59.6M
        const2 = _mm_subs_epi16(zero, const2);
974
59.6M
        i_C = _mm_add_epi16(i_C, const2);
975
976
        //Calculate in_macro
        // delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
977
59.6M
        diff = _mm_subs_epi16(lined, linec);
978
59.6M
        diff = _mm_slli_epi16(diff, 2);
979
59.6M
        const2 = _mm_subs_epi16(lineb, linee);
980
59.6M
        diff = _mm_add_epi16(diff, const2);
981
59.6M
        const2 = _mm_set1_epi16(4);
982
59.6M
        diff = _mm_add_epi16(diff, const2);
983
59.6M
        in_macro = _mm_srai_epi16(diff, 3);
984
985
59.6M
        // Clamp delta to [-C, C]; i_C holds -C afterwards.
        in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
986
59.6M
        i_C = _mm_subs_epi16(zero, i_C);
987
59.6M
        in_macro = _mm_max_epi16(i_C, in_macro);
988
989
        //Compute and store
        // temp = new p0: p0 + delta where flag is set, original p0 elsewhere.
        // No clamp to 0..255 happens here; the unsigned-saturating pack before
        // the final stores provides it.
990
59.6M
        in_macrotemp = _mm_add_epi16(linec, in_macro);
991
59.6M
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
992
59.6M
        temp = _mm_and_si128(linec,
993
59.6M
                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
994
59.6M
        temp = _mm_add_epi16(temp, in_macrotemp);
995
        //temp= _mm_packus_epi16 (temp, zero);
996
        //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);
997
998
59.6M
        // temp1 = new q0: q0 - delta where flag is set, original q0 elsewhere.
        in_macrotemp = _mm_subs_epi16(lined, in_macro);
999
59.6M
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
1000
59.6M
        temp1 = _mm_and_si128(lined,
1001
59.6M
                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
1002
59.6M
        temp1 = _mm_add_epi16(temp1, in_macrotemp);
1003
        //temp1= _mm_packus_epi16 (temp1, zero);
1004
        //_mm_storel_epi64(pu1_src+i, in_macrotemp);
1005
1006
        //If Ap<Beta
        // p1 += CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1),
        // applied only in lanes where flag is set and Ap < beta.
        // linec/lined still hold the unfiltered p0/q0 at this point.
1007
59.6M
        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
1008
59.6M
        flag1 = _mm_and_si128(flag, flag1);
1009
59.6M
        in_macrotemp = _mm_add_epi16(linec, lined);
1010
59.6M
        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
1011
59.6M
        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
1012
59.6M
        in_macro = _mm_add_epi16(in_macrotemp, linea);
1013
59.6M
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
1014
59.6M
        in_macro = _mm_srai_epi16(in_macro, 1);
1015
1016
59.6M
        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
1017
59.6M
        i_C0 = _mm_subs_epi16(zero, i_C0);
1018
59.6M
        in_macro = _mm_max_epi16(i_C0, in_macro);
1019
1020
59.6M
        in_macro = _mm_and_si128(in_macro, flag1);
1021
59.6M
        lineb = _mm_add_epi16(lineb, in_macro);
1022
        //in_macro= _mm_packus_epi16 (i_p1, zero);
1023
        //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);
1024
1025
59.6M
        // Same for the Q side where Aq < beta:
        // q1 += CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1)
        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
1026
59.6M
        flag1 = _mm_and_si128(flag, flag1);
1027
59.6M
        in_macro = _mm_add_epi16(in_macrotemp, linef);
1028
59.6M
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
1029
59.6M
        in_macro = _mm_srai_epi16(in_macro, 1);
1030
1031
59.6M
        // i_C0 was negated for the P-side clamp; abs restores +C0.
        i_C0 = _mm_abs_epi16(i_C0);
1032
59.6M
        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
1033
59.6M
        i_C0 = _mm_subs_epi16(zero, i_C0);
1034
59.6M
        in_macro = _mm_max_epi16(i_C0, in_macro);
1035
1036
59.6M
        in_macro = _mm_and_si128(in_macro, flag1);
1037
59.6M
        linee = _mm_add_epi16(linee, in_macro);
1038
        //in_macro= _mm_packus_epi16 (i_q1, zero);
1039
        //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
1040
59.6M
        // Commit the filtered p0/q0 only now, after p1/q1 used the originals.
        linec = temp;
1041
59.6M
        lined = temp1;
1042
        //End of filtering
1043
1044
59.6M
        // Inverse transpose: turn the eight sample-position vectors back into
        // eight row vectors (linea..lineh are stored as rows 0..7 below).
        int1 = _mm_unpacklo_epi16(linea, linee);
1045
59.6M
        linee = _mm_unpackhi_epi16(linea, linee);
1046
1047
59.6M
        int2 = _mm_unpacklo_epi16(linec, lineg);
1048
59.6M
        lineg = _mm_unpackhi_epi16(linec, lineg);
1049
1050
59.6M
        linea = _mm_unpacklo_epi16(int1, int2);
1051
59.6M
        int3 = _mm_unpackhi_epi16(int1, int2);
1052
1053
59.6M
        linec = _mm_unpacklo_epi16(linee, lineg);
1054
59.6M
        lineg = _mm_unpackhi_epi16(linee, lineg);
1055
1056
59.6M
        int1 = _mm_unpacklo_epi16(lineb, linef);
1057
59.6M
        linef = _mm_unpackhi_epi16(lineb, linef);
1058
1059
59.6M
        int2 = _mm_unpacklo_epi16(lined, lineh);
1060
59.6M
        lineh = _mm_unpackhi_epi16(lined, lineh);
1061
1062
59.6M
        lineb = _mm_unpacklo_epi16(int1, int2);
1063
59.6M
        int4 = _mm_unpackhi_epi16(int1, int2);
1064
1065
59.6M
        lined = _mm_unpacklo_epi16(linef, lineh);
1066
59.6M
        lineh = _mm_unpackhi_epi16(linef, lineh);
1067
1068
59.6M
        int1 = _mm_unpackhi_epi16(linea, lineb);
1069
59.6M
        linea = _mm_unpacklo_epi16(linea, lineb);
1070
1071
59.6M
        int2 = _mm_unpacklo_epi16(int3, int4);
1072
59.6M
        high1 = _mm_unpackhi_epi16(int3, int4);
1073
1074
59.6M
        lineb = _mm_unpacklo_epi16(linec, lined);
1075
59.6M
        linef = _mm_unpackhi_epi16(linec, lined);
1076
1077
59.6M
        lined = _mm_unpacklo_epi16(lineg, lineh);
1078
59.6M
        lineh = _mm_unpackhi_epi16(lineg, lineh);
1079
1080
59.6M
        linee = int1;
1081
59.6M
        lineg = high1;
1082
59.6M
        linec = int2;
1083
        //End of inverse transpose
1084
1085
        //Packs and stores
        // _mm_packus_epi16 saturates each 16-bit value to 0..255; the low
        // 8 bytes are written back to the same 8-byte span that was loaded,
        // so samples that were not filtered are rewritten with their
        // original values.
1086
59.6M
        linea = _mm_packus_epi16(linea, zero);
1087
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
1088
1089
59.6M
        lineb = _mm_packus_epi16(lineb, zero);
1090
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
1091
1092
59.6M
        linec = _mm_packus_epi16(linec, zero);
1093
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
1094
1095
59.6M
        lined = _mm_packus_epi16(lined, zero);
1096
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
1097
1098
59.6M
        linee = _mm_packus_epi16(linee, zero);
1099
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
1100
1101
59.6M
        linef = _mm_packus_epi16(linef, zero);
1102
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
1103
1104
59.6M
        lineg = _mm_packus_epi16(lineg, zero);
1105
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
1106
1107
59.6M
        lineh = _mm_packus_epi16(lineh, zero);
1108
59.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
1109
1110
59.6M
    }
1111
29.8M
}
1112
1113
/*****************************************************************************/
1114
/*                                                                           */
1115
/*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
1116
/*                                                                           */
1117
/*  Description   : This function performs filtering of a luma block         */
1118
/*                  horizontal edge when boundary strength is less than 4.   */
1119
/*                                                                           */
1120
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1121
/*                  src_strd      - source stride                            */
1122
/*                  alpha         - alpha value for the boundary             */
1123
/*                  beta          - beta value for the boundary              */
1124
/*                  u4_bs         - packed Boundary strength array           */
1125
/*                  pu1_cliptab   - tc0_table                                */
1126
/*                                                                           */
1127
/*  Globals       : None                                                     */
1128
/*                                                                           */
1129
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
1130
/*                  title "Filtering process for edges for bS less than 4"   */
1131
/*                  in ITU T Rec H.264.                                      */
1132
/*                                                                           */
1133
/*  Outputs       : None                                                     */
1134
/*                                                                           */
1135
/*  Returns       : None                                                     */
1136
/*                                                                           */
1137
/*  Issues        : None                                                     */
1138
/*                                                                           */
1139
/*  Revision History:                                                        */
1140
/*                                                                           */
1141
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1142
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1143
/*                                                                           */
1144
/*****************************************************************************/
1145
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
1146
                                       WORD32 src_strd,
1147
                                       WORD32 alpha,
1148
                                       WORD32 beta,
1149
                                       UWORD32 u4_bs,
1150
                                       const UWORD8 *pu1_cliptab)
1151
30.0M
{
1152
30.0M
    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
1153
30.0M
    UWORD8 *pu1_HorzPixel;
1154
30.0M
    __m128i zero = _mm_setzero_si128();
1155
30.0M
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
1156
30.0M
    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
1157
30.0M
    __m128i temp1, temp2;
1158
30.0M
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1159
30.0M
    __m128i in_macro_16x8, in_macro_hi_16x8;
1160
30.0M
    __m128i const_val4_8x16;
1161
30.0M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1162
30.0M
    UWORD8 clip0, clip1, clip2, clip3;
1163
1164
30.0M
    pu1_HorzPixel = pu1_src - (src_strd << 2);
1165
1166
30.0M
    i16_posQ1 = src_strd;
1167
30.0M
    i16_posQ2 = X2(src_strd);
1168
30.0M
    i16_posP0 = X3(src_strd);
1169
30.0M
    i16_posP1 = X2(src_strd);
1170
30.0M
    i16_posP2 = src_strd;
1171
1172
30.0M
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
1173
30.0M
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
1174
1175
30.0M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
1176
30.0M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
1177
30.0M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
1178
30.0M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
1179
30.0M
    clip0 = pu1_cliptab[u1_Bs0];
1180
30.0M
    clip1 = pu1_cliptab[u1_Bs1];
1181
30.0M
    clip2 = pu1_cliptab[u1_Bs2];
1182
30.0M
    clip3 = pu1_cliptab[u1_Bs3];
1183
1184
30.0M
    Alpha_8x16 = _mm_set1_epi16(alpha);
1185
30.0M
    Beta_8x16 = _mm_set1_epi16(beta);
1186
1187
30.0M
    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
1188
30.0M
                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
1189
30.0M
                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
1190
1191
30.0M
    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
1192
30.0M
                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
1193
30.0M
                           clip0, clip0);
1194
1195
30.0M
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1196
30.0M
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1197
30.0M
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1198
30.0M
    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
1199
1200
30.0M
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
1201
30.0M
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
1202
30.0M
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
1203
30.0M
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
1204
1205
    //Cond1 (ABS(p0 - q0) < alpha)
1206
30.0M
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1207
30.0M
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1208
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1209
1210
30.0M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1211
30.0M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1212
1213
30.0M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1214
30.0M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1215
1216
30.0M
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1217
30.0M
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1218
1219
    //Cond2 (ABS(q1 - q0) < beta)
1220
30.0M
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1221
30.0M
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1222
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1223
1224
30.0M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1225
30.0M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1226
1227
30.0M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1228
30.0M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1229
1230
30.0M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1231
1232
30.0M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1233
1234
    //Cond3 (ABS(p1 - p0) < beta)
1235
30.0M
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1236
30.0M
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1237
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1238
1239
30.0M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1240
30.0M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1241
1242
30.0M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1243
30.0M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1244
1245
30.0M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1246
1247
    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1248
30.0M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1249
1250
    // (ABS(p2 - p0) < beta)
1251
30.0M
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1252
30.0M
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1253
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1254
1255
30.0M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1256
30.0M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1257
30.0M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1258
30.0M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1259
1260
30.0M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1261
30.0M
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1262
1263
30.0M
    temp2 = _mm_subs_epi16(zero, temp2);
1264
30.0M
    temp1 = _mm_subs_epi16(zero, temp1);
1265
1266
30.0M
    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1267
30.0M
    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
1268
1269
    // (ABS(q2 - q0) < beta)
1270
30.0M
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1271
30.0M
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1272
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1273
1274
30.0M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1275
30.0M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1276
30.0M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1277
30.0M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1278
1279
30.0M
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1280
30.0M
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1281
1282
30.0M
    temp2 = _mm_subs_epi16(zero, temp2);
1283
30.0M
    temp1 = _mm_subs_epi16(zero, temp1);
1284
1285
30.0M
    C_8x16 = _mm_add_epi16(C_8x16, temp2);
1286
30.0M
    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
1287
1288
30.0M
    const_val4_8x16 = _mm_set1_epi16(4);
1289
30.0M
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1290
30.0M
                           _mm_unpacklo_epi8(p0_16x8, zero));
1291
30.0M
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1292
30.0M
                           _mm_unpacklo_epi8(q1_16x8, zero));
1293
30.0M
    temp1 = _mm_slli_epi16(temp1, 2);
1294
30.0M
    temp1 = _mm_add_epi16(temp1, temp2);
1295
30.0M
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1296
30.0M
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1297
1298
30.0M
    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
1299
30.0M
                           _mm_unpackhi_epi8(p0_16x8, zero));
1300
30.0M
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
1301
30.0M
                           _mm_unpackhi_epi8(q1_16x8, zero));
1302
30.0M
    temp1 = _mm_slli_epi16(temp1, 2);
1303
30.0M
    temp1 = _mm_add_epi16(temp1, temp2);
1304
30.0M
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1305
30.0M
    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
1306
1307
30.0M
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1308
30.0M
    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1309
30.0M
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
1310
30.0M
    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
1311
30.0M
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1312
30.0M
    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1313
1314
30.0M
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1315
30.0M
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
1316
1317
30.0M
    temp1 = _mm_packus_epi16(temp1, temp2);
1318
1319
30.0M
    temp1 = _mm_and_si128(temp1, flag1_16x8);
1320
30.0M
    temp2 = _mm_and_si128(p0_16x8,
1321
30.0M
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1322
1323
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1324
1325
30.0M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
1326
1327
30.0M
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1328
30.0M
    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
1329
1330
30.0M
    temp1 = _mm_packus_epi16(temp1, temp2);
1331
1332
30.0M
    temp1 = _mm_and_si128(temp1, flag1_16x8);
1333
30.0M
    temp2 = _mm_and_si128(q0_16x8,
1334
30.0M
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1335
1336
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1337
30.0M
    _mm_storeu_si128((__m128i *)(pu1_src), temp1);
1338
1339
    //if(Ap < Beta)
1340
30.0M
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1341
30.0M
                          _mm_unpacklo_epi8(p0_16x8, zero));
1342
30.0M
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1343
    //temp2 = _mm_subs_epi16(zero,temp2);
1344
30.0M
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1345
30.0M
    temp2 = _mm_add_epi16(temp1, temp2);
1346
30.0M
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1347
1348
30.0M
    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1349
30.0M
                          _mm_unpackhi_epi8(p0_16x8, zero));
1350
30.0M
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
1351
    //temp2 = _mm_subs_epi16(zero,temp2);
1352
30.0M
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
1353
30.0M
    temp2 = _mm_add_epi16(temp1, temp2);
1354
30.0M
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1355
1356
30.0M
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1357
30.0M
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1358
30.0M
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1359
30.0M
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1360
30.0M
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1361
30.0M
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1362
1363
30.0M
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1364
30.0M
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
1365
1366
30.0M
    temp1 = _mm_packus_epi16(temp1, temp2);
1367
1368
30.0M
    temp1 = _mm_and_si128(temp1, flag2_16x8);
1369
30.0M
    temp2 = _mm_and_si128(p1_16x8,
1370
30.0M
                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1371
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1372
30.0M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
1373
1374
    //if(Aq < Beta)
1375
30.0M
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1376
30.0M
                          _mm_unpacklo_epi8(p0_16x8, zero));
1377
30.0M
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1378
    //temp2 = _mm_slli_epi16 (temp2, 1);
1379
30.0M
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1380
30.0M
    temp2 = _mm_add_epi16(temp1, temp2);
1381
30.0M
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1382
1383
30.0M
    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1384
30.0M
                          _mm_unpackhi_epi8(p0_16x8, zero));
1385
30.0M
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
1386
    //temp2 = _mm_slli_epi16 (temp2, 1);
1387
30.0M
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
1388
30.0M
    temp2 = _mm_add_epi16(temp1, temp2);
1389
30.0M
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1390
1391
30.0M
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1392
30.0M
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1393
30.0M
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1394
30.0M
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1395
30.0M
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1396
30.0M
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1397
1398
30.0M
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1399
30.0M
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
1400
1401
30.0M
    temp1 = _mm_packus_epi16(temp1, temp2);
1402
1403
30.0M
    temp1 = _mm_and_si128(temp1, flag3_16x8);
1404
30.0M
    temp2 = _mm_and_si128(q1_16x8,
1405
30.0M
                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1406
30.0M
    temp1 = _mm_add_epi8(temp1, temp2);
1407
1408
30.0M
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
1409
1410
30.0M
}
1411
1412
/*****************************************************************************/
1413
/*                                                                           */
1414
/*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
1415
/*                                                                           */
1416
/*  Description   : This function performs filtering of a luma block         */
1417
/*                  vertical edge when boundary strength is set to 4.        */
1418
/*                                                                           */
1419
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1420
/*                  src_strd      - source stride                            */
1421
/*                  alpha         - alpha value for the boundary             */
1422
/*                  beta          - beta value for the boundary              */
1423
/*                                                                           */
1424
/*  Globals       : None                                                     */
1425
/*                                                                           */
1426
/*  Processing    : When the function is called twice, this operation is as  */
1427
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1428
/*                  process for edges for bS equal to 4" in ITU T Rec H.264. */
1429
/*                                                                           */
1430
/*  Outputs       : None                                                     */
1431
/*                                                                           */
1432
/*  Returns       : None                                                     */
1433
/*                                                                           */
1434
/*  Issues        : None                                                     */
1435
/*                                                                           */
1436
/*  Revision History:                                                        */
1437
/*                                                                           */
1438
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1439
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1440
/*                                                                           */
1441
/*****************************************************************************/
1442
void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
1443
                                           WORD32 src_strd,
1444
                                           WORD32 alpha,
1445
                                           WORD32 beta)
1446
21.8k
{
1447
21.8k
    __m128i zero = _mm_setzero_si128();
1448
21.8k
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1449
21.8k
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1450
21.8k
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
1451
21.8k
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
1452
21.8k
    __m128i q0_16x8_1;
1453
21.8k
    __m128i p0_16x8_1;
1454
21.8k
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
1455
21.8k
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
1456
21.8k
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
1457
21.8k
    __m128i Alpha_8x16, Beta_8x16;
1458
21.8k
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
1459
21.8k
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
1460
21.8k
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1461
1462
21.8k
    Alpha_8x16 = _mm_set1_epi16(alpha);
1463
21.8k
    Beta_8x16 = _mm_set1_epi16(beta);
1464
1465
21.8k
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1466
21.8k
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1467
21.8k
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1468
21.8k
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1469
21.8k
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1470
21.8k
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1471
21.8k
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1472
21.8k
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1473
1474
21.8k
    temp1 = _mm_unpacklo_epi8(line1, line2);
1475
21.8k
    temp2 = _mm_unpacklo_epi8(line3, line4);
1476
21.8k
    temp3 = _mm_unpacklo_epi8(line5, line6);
1477
21.8k
    temp4 = _mm_unpacklo_epi8(line7, line8);
1478
1479
21.8k
    line1 = _mm_unpacklo_epi16(temp1, temp2);
1480
21.8k
    line2 = _mm_unpackhi_epi16(temp1, temp2);
1481
21.8k
    line3 = _mm_unpacklo_epi16(temp3, temp4);
1482
21.8k
    line4 = _mm_unpackhi_epi16(temp3, temp4);
1483
1484
21.8k
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
1485
21.8k
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
1486
21.8k
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
1487
21.8k
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
1488
1489
21.8k
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
1490
21.8k
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
1491
21.8k
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
1492
21.8k
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
1493
21.8k
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
1494
21.8k
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
1495
21.8k
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
1496
21.8k
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
1497
1498
    //Cond1 (ABS(p0 - q0) < alpha)
1499
21.8k
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1500
21.8k
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1501
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1502
1503
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1504
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1505
1506
21.8k
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1507
21.8k
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1508
1509
21.8k
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1510
1511
    //Cond2 (ABS(q1 - q0) < beta)
1512
21.8k
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1513
21.8k
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1514
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1515
1516
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1517
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1518
1519
21.8k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1520
21.8k
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1521
1522
21.8k
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1523
1524
21.8k
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1525
1526
    //Cond3 (ABS(p1 - p0) < beta)
1527
21.8k
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1528
21.8k
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1529
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1530
1531
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1532
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1533
1534
21.8k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1535
21.8k
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1536
1537
21.8k
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1538
1539
    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1540
21.8k
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1541
1542
    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
1543
21.8k
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
1544
21.8k
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
1545
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1546
21.8k
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
1547
21.8k
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
1548
1549
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1550
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1551
21.8k
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1552
21.8k
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1553
1554
21.8k
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1555
21.8k
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1556
1557
    // (ABS(p2 - p0) < beta)
1558
21.8k
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1559
21.8k
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1560
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1561
1562
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1563
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1564
21.8k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1565
21.8k
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1566
1567
21.8k
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1568
21.8k
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
1569
1570
    // (ABS(q2 - q0) < beta)
1571
21.8k
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1572
21.8k
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1573
21.8k
    temp1 = _mm_add_epi8(temp1, temp2);
1574
1575
21.8k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1576
21.8k
    temp1 = _mm_unpackhi_epi8(temp1, zero);
1577
21.8k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1578
21.8k
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1579
1580
21.8k
    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
1581
21.8k
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
1582
1583
    // First 8 pixels
1584
21.8k
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
1585
21.8k
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
1586
21.8k
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
1587
21.8k
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
1588
21.8k
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
1589
21.8k
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
1590
21.8k
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
1591
21.8k
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
1592
1593
    // p0_1 and q0_1
1594
21.8k
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
1595
21.8k
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
1596
21.8k
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
1597
21.8k
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
1598
21.8k
    temp3 = _mm_slli_epi16(p1_8x16, 1);
1599
21.8k
    temp4 = _mm_slli_epi16(q1_8x16, 1);
1600
21.8k
    temp1 = _mm_add_epi16(temp5, temp3);
1601
21.8k
    temp2 = _mm_add_epi16(temp6, temp4);
1602
21.8k
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
1603
21.8k
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
1604
1605
    // p1_2 and q1_2
1606
21.8k
    temp6 = _mm_add_epi16(temp6, p0_8x16);
1607
21.8k
    temp5 = _mm_add_epi16(temp5, q0_8x16);
1608
21.8k
    temp1 = _mm_add_epi16(temp6, p2_8x16);
1609
21.8k
    temp2 = _mm_add_epi16(temp5, q2_8x16);
1610
21.8k
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
1611
21.8k
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
1612
1613
    // p0_2 and q0_2
1614
21.8k
    temp1 = _mm_add_epi16(temp3, p2_8x16);
1615
21.8k
    temp2 = _mm_add_epi16(temp4, q2_8x16);
1616
21.8k
    temp1 = _mm_add_epi16(temp1, q1_8x16);
1617
21.8k
    temp2 = _mm_add_epi16(temp2, p1_8x16);
1618
21.8k
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
1619
21.8k
    temp3 = _mm_slli_epi16(temp3, 1);
1620
21.8k
    temp1 = _mm_add_epi16(temp1, temp3);
1621
21.8k
    temp2 = _mm_add_epi16(temp2, temp3);
1622
21.8k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
1623
21.8k
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
1624
21.8k
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
1625
21.8k
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
1626
1627
    // p2_2 and q2_2
1628
21.8k
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
1629
21.8k
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
1630
21.8k
    temp3 = _mm_slli_epi16(p2_8x16, 1);
1631
21.8k
    temp4 = _mm_slli_epi16(q2_8x16, 1);
1632
21.8k
    temp3 = _mm_add_epi16(p2_8x16, temp3);
1633
21.8k
    temp4 = _mm_add_epi16(q2_8x16, temp4);
1634
21.8k
    temp5 = _mm_slli_epi16(p3_8x16, 1);
1635
21.8k
    temp6 = _mm_slli_epi16(q3_8x16, 1);
1636
21.8k
    temp1 = _mm_add_epi16(temp1, temp3);
1637
21.8k
    temp2 = _mm_add_epi16(temp2, temp4);
1638
21.8k
    temp1 = _mm_add_epi16(temp1, temp5);
1639
21.8k
    temp2 = _mm_add_epi16(temp2, temp6);
1640
21.8k
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
1641
21.8k
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
1642
1643
    // p0_1 and q0_1
1644
21.8k
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
1645
21.8k
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
1646
1647
    // p1_2 and q1_2
1648
21.8k
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
1649
21.8k
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
1650
1651
    // p0_2 and q0_2
1652
21.8k
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
1653
21.8k
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
1654
1655
    // p2_2 and q2_2
1656
21.8k
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
1657
21.8k
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
1658
1659
    // p0 and q0
1660
21.8k
    p0_16x8 = _mm_and_si128(p0_16x8,
1661
21.8k
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1662
21.8k
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
1663
21.8k
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
1664
21.8k
    q0_16x8 = _mm_and_si128(q0_16x8,
1665
21.8k
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1666
21.8k
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
1667
21.8k
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
1668
1669
    // p0 and q0
1670
21.8k
    p0_16x8 = _mm_and_si128(p0_16x8,
1671
21.8k
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1672
21.8k
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
1673
21.8k
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
1674
21.8k
    q0_16x8 = _mm_and_si128(q0_16x8,
1675
21.8k
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1676
21.8k
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
1677
21.8k
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
1678
1679
    // p1 and q1
1680
21.8k
    p1_16x8 = _mm_and_si128(p1_16x8,
1681
21.8k
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1682
21.8k
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
1683
21.8k
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
1684
21.8k
    q1_16x8 = _mm_and_si128(q1_16x8,
1685
21.8k
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1686
21.8k
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
1687
21.8k
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
1688
1689
    // p2 and q2
1690
21.8k
    p2_16x8 = _mm_and_si128(p2_16x8,
1691
21.8k
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1692
21.8k
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
1693
21.8k
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
1694
21.8k
    q2_16x8 = _mm_and_si128(q2_16x8,
1695
21.8k
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1696
21.8k
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
1697
21.8k
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
1698
1699
21.8k
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1700
21.8k
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
1701
21.8k
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
1702
21.8k
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1703
1704
21.8k
    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
1705
21.8k
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
1706
21.8k
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
1707
21.8k
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
1708
1709
21.8k
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
1710
21.8k
    line2 = _mm_srli_si128(line1, 8);
1711
21.8k
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
1712
21.8k
    line4 = _mm_srli_si128(line3, 8);
1713
21.8k
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
1714
21.8k
    line6 = _mm_srli_si128(line5, 8);
1715
21.8k
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
1716
21.8k
    line8 = _mm_srli_si128(line7, 8);
1717
1718
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
1719
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
1720
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
1721
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
1722
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
1723
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
1724
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
1725
21.8k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
1726
1727
21.8k
}
1728
1729
/*****************************************************************************/
1730
/*                                                                           */
1731
/*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
1732
/*                                                                           */
1733
/*  Description   : This function performs filtering of a luma block         */
1734
/*                  vertical edge when boundary strength is less than 4.     */
1735
/*                                                                           */
1736
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1737
/*                  src_strd      - source stride                            */
1738
/*                  alpha         - alpha value for the boundary             */
1739
/*                  beta          - beta value for the boundary              */
1740
/*                  u4_bs         - packed Boundary strength array           */
1741
/*                  pu1_cliptab   - tc0_table                                */
1742
/*                                                                           */
1743
/*  Globals       : None                                                     */
1744
/*                                                                           */
1745
/*  Processing    : When the function is called twice, this operation is as  */
1746
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1747
/*                  process for edges for bS less than 4" in ITU T Rec H.264.*/
1748
/*                                                                           */
1749
/*  Outputs       : None                                                     */
1750
/*                                                                           */
1751
/*  Returns       : None                                                     */
1752
/*                                                                           */
1753
/*  Issues        : None                                                     */
1754
/*                                                                           */
1755
/*  Revision History:                                                        */
1756
/*                                                                           */
1757
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1758
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1759
/*                                                                           */
1760
/*****************************************************************************/
1761
/*
 * Filters one vertical luma edge over 8 rows using SSE2/SSSE3 intrinsics.
 * The file's banner comment describes this as the "bS less than 4" filter;
 * the code itself only tests each strength byte for being non-zero.
 *
 * pu1_src     - points at the first pixel right of the edge (q0) in row 0;
 *               the 8 bytes pu1_src-4 .. pu1_src+3 of each of the 8 rows are
 *               read, and the same 8 bytes per row are stored back.
 * src_strd    - offset, in bytes, between consecutive rows.
 * alpha, beta - thresholds, broadcast to 16-bit lanes for the comparisons.
 * u4_bs       - four packed 8-bit strengths; the most significant byte is
 *               applied to rows 0-1 and the least significant to rows 6-7.
 * pu1_cliptab - table indexed by each strength byte to obtain the clip
 *               value C0 for the corresponding pair of rows.
 *
 * Only p1, p0, q0 and q1 can change; p3, p2, q2 and q3 are written back
 * with the values that were loaded.
 */
void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
1762
                                             WORD32 src_strd,
1763
                                             WORD32 alpha,
1764
                                             WORD32 beta,
1765
                                             UWORD32 u4_bs,
1766
                                             const UWORD8 *pu1_cliptab)
1767
28.6k
{
1768
28.6k
    __m128i zero = _mm_setzero_si128();
1769
28.6k
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
1770
28.6k
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1771
28.6k
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1772
28.6k
    __m128i temp1, temp2, temp3, temp4;
1773
28.6k
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1774
28.6k
    __m128i in_macro_16x8;
1775
28.6k
    __m128i const_val4_8x16;
1776
28.6k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1777
28.6k
    UWORD8 clip0, clip1, clip2, clip3;
1778
28.6k
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1779
28.6k
    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
1780
28.6k
    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
1781
1782
28.6k
    // Load 8 rows of 8 pixels each: p3 p2 p1 p0 | q0 q1 q2 q3 around the edge.
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1783
28.6k
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1784
28.6k
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1785
28.6k
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1786
28.6k
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1787
28.6k
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1788
28.6k
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1789
28.6k
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1790
1791
28.6k
    // 8x8 byte transpose, stage 1: interleave bytes of adjacent row pairs.
    temp1 = _mm_unpacklo_epi8(line1, line2);
1792
28.6k
    temp2 = _mm_unpacklo_epi8(line3, line4);
1793
28.6k
    temp3 = _mm_unpacklo_epi8(line5, line6);
1794
28.6k
    temp4 = _mm_unpacklo_epi8(line7, line8);
1795
1796
28.6k
    // Stage 2: line1/line2 hold columns 0-3/4-7 of rows 0-3,
    // line3/line4 hold columns 0-3/4-7 of rows 4-7.
    line1 = _mm_unpacklo_epi16(temp1, temp2);
1797
28.6k
    line2 = _mm_unpackhi_epi16(temp1, temp2);
1798
28.6k
    line3 = _mm_unpacklo_epi16(temp3, temp4);
1799
28.6k
    line4 = _mm_unpackhi_epi16(temp3, temp4);
1800
1801
28.6k
    // Stage 3: each temp now holds two complete 8-row columns (low/high half).
    temp1 = _mm_unpacklo_epi32(line1, line3);
1802
28.6k
    temp2 = _mm_unpackhi_epi32(line1, line3);
1803
28.6k
    temp3 = _mm_unpacklo_epi32(line2, line4);
1804
28.6k
    temp4 = _mm_unpackhi_epi32(line2, line4);
1805
1806
28.6k
    // Split into one column per register: the low 8 bytes carry the 8 rows,
    // the high 8 bytes are zero.
    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
1807
28.6k
    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
1808
28.6k
    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
1809
28.6k
    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
1810
28.6k
    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
1811
28.6k
    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
1812
28.6k
    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
1813
28.6k
    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
1814
1815
28.6k
    // Unpack the four strength bytes (most significant byte first) and look
    // up the clip value for each of them.
    u1_Bs0 = (u4_bs >> 24) & 0xff;
1816
28.6k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
1817
28.6k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
1818
28.6k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
1819
28.6k
    clip0 = pu1_cliptab[u1_Bs0];
1820
28.6k
    clip1 = pu1_cliptab[u1_Bs1];
1821
28.6k
    clip2 = pu1_cliptab[u1_Bs2];
1822
28.6k
    clip3 = pu1_cliptab[u1_Bs3];
1823
1824
28.6k
    Alpha_8x16 = _mm_set1_epi16(alpha);
1825
28.6k
    Beta_8x16 = _mm_set1_epi16(beta);
1826
1827
28.6k
    // Each strength is replicated twice, so one Bs byte governs two
    // consecutive rows (Bs0 -> rows 0-1 ... Bs3 -> rows 6-7).
    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
1828
28.6k
                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
1829
1830
28.6k
    // Clip values laid out per row in the same way as the strengths.
    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
1831
28.6k
                           clip1, clip1, clip0, clip0);
1832
1833
28.6k
    // Build a byte mask that is 0xFF for rows whose Bs is non-zero.
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1834
28.6k
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1835
28.6k
    // Widen C0 to 16-bit lanes for the signed clamp arithmetic below.
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1836
1837
    //Cond1 (ABS(p0 - q0) < alpha)
1838
28.6k
    // Absolute difference: one of the two saturating subtractions is always
    // zero, so their sum is |p0 - q0|.
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1839
28.6k
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1840
28.6k
    temp1 = _mm_add_epi8(temp1, temp2);
1841
1842
28.6k
    // Compare in 16-bit lanes, then pack the 0/-1 results back to a byte mask.
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1843
28.6k
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1844
1845
28.6k
    flag1_16x8 = _mm_packs_epi16(temp2, zero);
1846
28.6k
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1847
1848
    //Cond2 (ABS(q1 - q0) < beta)
1849
28.6k
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1850
28.6k
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1851
28.6k
    temp1 = _mm_add_epi8(temp1, temp2);
1852
1853
28.6k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1854
28.6k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1855
1856
28.6k
    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1857
28.6k
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1858
1859
    //Cond3 (ABS(p1 - p0) < beta)
1860
28.6k
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1861
28.6k
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1862
28.6k
    temp1 = _mm_add_epi8(temp1, temp2);
1863
1864
28.6k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1865
28.6k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1866
1867
28.6k
    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1868
1869
    // flag1 = (Bs != 0) && (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
1870
28.6k
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1871
1872
    // (ABS(p2 - p0) < beta)
1873
28.6k
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1874
28.6k
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1875
28.6k
    temp1 = _mm_add_epi8(temp1, temp2);
1876
1877
28.6k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1878
28.6k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1879
1880
28.6k
    // flag2 selects the rows where p1 is modified: flag1 && (ABS(p2 - p0) < beta).
    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1881
28.6k
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1882
1883
28.6k
    // The compare result is 0 or -1 per lane; negating it yields 0 or +1.
    temp2 = _mm_subs_epi16(zero, temp2);
1884
1885
28.6k
    // C = C0 + ((ABS(p2 - p0) < beta) ? 1 : 0)
    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1886
1887
    // (ABS(q2 - q0) < beta)
1888
28.6k
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1889
28.6k
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1890
28.6k
    temp1 = _mm_add_epi8(temp1, temp2);
1891
1892
28.6k
    temp2 = _mm_unpacklo_epi8(temp1, zero);
1893
28.6k
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1894
1895
28.6k
    // flag3 selects the rows where q1 is modified: flag1 && (ABS(q2 - q0) < beta).
    flag3_16x8 = _mm_packs_epi16(temp2, zero);
1896
28.6k
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1897
1898
28.6k
    temp2 = _mm_subs_epi16(zero, temp2);
1899
1900
28.6k
    // C += ((ABS(q2 - q0) < beta) ? 1 : 0)
    C_8x16 = _mm_add_epi16(C_8x16, temp2);
1901
1902
28.6k
    // delta = CLIP3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    const_val4_8x16 = _mm_set1_epi16(4);
1903
28.6k
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1904
28.6k
                           _mm_unpacklo_epi8(p0_16x8, zero));
1905
28.6k
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1906
28.6k
                           _mm_unpacklo_epi8(q1_16x8, zero));
1907
28.6k
    temp1 = _mm_slli_epi16(temp1, 2);
1908
28.6k
    temp1 = _mm_add_epi16(temp1, temp2);
1909
28.6k
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1910
28.6k
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1911
1912
28.6k
    // Clamp to +C first, then negate C in place and clamp to -C.
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1913
28.6k
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
1914
28.6k
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1915
1916
    // p0
1917
28.6k
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1918
1919
28.6k
    // Pack with unsigned saturation, i.e. clip p0 + delta to 0..255.
    temp1 = _mm_packus_epi16(temp1, zero);
1920
1921
28.6k
    // Select the filtered value where flag1 is set and the original elsewhere;
    // the two masked operands are disjoint, so the byte add merges them.
    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1922
28.6k
    p0_16x8_2 = _mm_and_si128(
1923
28.6k
                    p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1924
1925
28.6k
    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
1926
1927
    // q0
1928
28.6k
    // q0 - delta, clipped to 0..255 and selected by flag1 like p0 above.
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1929
1930
28.6k
    temp1 = _mm_packus_epi16(temp1, zero);
1931
1932
28.6k
    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1933
28.6k
    q0_16x8_2 = _mm_and_si128(
1934
28.6k
                    q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1935
1936
28.6k
    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
1937
1938
    //if(Ap < Beta)
1939
28.6k
    // p1 adjustment = CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
    // _mm_avg_epu16 supplies the rounded average (p0 + q0 + 1) >> 1.
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1940
28.6k
                          _mm_unpacklo_epi8(p0_16x8, zero));
1941
28.6k
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1942
    //temp2 = _mm_subs_epi16(zero,temp2);
1943
28.6k
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1944
28.6k
    temp2 = _mm_add_epi16(temp1, temp2);
1945
28.6k
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1946
1947
28.6k
    // Clamp to +C0, then negate C0 in place and clamp to -C0.
    // C0_8x16 is left holding -C0 after this sequence.
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1948
28.6k
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1949
28.6k
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1950
1951
    // p1
1952
28.6k
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1953
1954
28.6k
    temp1 = _mm_packus_epi16(temp1, zero);
1955
1956
28.6k
    // Keep the filtered p1 only where flag2 is set; original p1 elsewhere.
    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
1957
28.6k
    p1_16x8 = _mm_and_si128(p1_16x8,
1958
28.6k
                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1959
28.6k
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
1960
1961
    //if(Aq < Beta)
1962
28.6k
    // q1 adjustment = CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1)
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1963
28.6k
                          _mm_unpacklo_epi8(p0_16x8, zero));
1964
28.6k
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1965
    //temp2 = _mm_slli_epi16 (temp2, 1);
1966
28.6k
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1967
28.6k
    temp2 = _mm_add_epi16(temp1, temp2);
1968
28.6k
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1969
1970
28.6k
    // C0_8x16 still holds -C0 from the p1 clamp, so the order is reversed
    // here: max against -C0 first, then negate back to +C0 and take the min.
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1971
28.6k
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1972
28.6k
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1973
1974
28.6k
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1975
1976
    // q1
1977
28.6k
    temp1 = _mm_packus_epi16(temp1, zero);
1978
1979
28.6k
    // Keep the filtered q1 only where flag3 is set; original q1 elsewhere.
    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
1980
28.6k
    q1_16x8 = _mm_and_si128(q1_16x8,
1981
28.6k
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1982
28.6k
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
1983
1984
28.6k
    // Transpose back from one-column-per-register to one-row-per-register,
    // using the filtered p1, p0, q0, q1 and the untouched p3, p2, q2, q3.
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1985
28.6k
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
1986
28.6k
    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
1987
28.6k
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1988
1989
28.6k
    // line7/temp1: left four pixels (p3..p0) of rows 0-3 / rows 4-7;
    // line8/temp2: right four pixels (q0..q3) of rows 0-3 / rows 4-7.
    line7 = _mm_unpacklo_epi16(temp1, temp2);
1990
28.6k
    temp1 = _mm_unpackhi_epi16(temp1, temp2);
1991
28.6k
    line8 = _mm_unpacklo_epi16(temp3, temp4);
1992
28.6k
    temp2 = _mm_unpackhi_epi16(temp3, temp4);
1993
1994
28.6k
    // Each 32-bit interleave yields two full rows; the 8-byte right shift
    // moves the second row of the pair into the low half for the store.
    line1 = _mm_unpacklo_epi32(line7, line8);
1995
28.6k
    line2 = _mm_srli_si128(line1, 8);
1996
28.6k
    line3 = _mm_unpackhi_epi32(line7, line8);
1997
28.6k
    line4 = _mm_srli_si128(line3, 8);
1998
28.6k
    line5 = _mm_unpacklo_epi32(temp1, temp2);
1999
28.6k
    line6 = _mm_srli_si128(line5, 8);
2000
28.6k
    line7 = _mm_unpackhi_epi32(temp1, temp2);
2001
28.6k
    line8 = _mm_srli_si128(line7, 8);
2002
2003
28.6k
    // Write the low 8 bytes of each row register back to where it was loaded.
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
2004
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
2005
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
2006
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
2007
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
2008
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
2009
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
2010
28.6k
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
2011
28.6k
}
2012