Coverage Report

Created: 2026-04-12 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_luma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_luma_ssse3.c                             */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
27
/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
28
/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
29
/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
30
/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
31
/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a luma block         */
65
/*                  vertical edge when the boundary strength is set to 4.    */
66
/*                                                                           */
67
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
68
/*                  src_strd   - source stride                               */
69
/*                  alpha      - alpha value for the boundary                */
70
/*                  beta       - beta value for the boundary                 */
71
/*                                                                           */
72
/*  Globals       : None                                                     */
73
/*                                                                           */
74
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
75
/*                  title "Filtering process for edges for bS equal to 4" in */
76
/*                  ITU T Rec H.264.                                         */
77
/*                                                                           */
78
/*  Outputs       : None                                                     */
79
/*                                                                           */
80
/*  Returns       : None                                                     */
81
/*                                                                           */
82
/*  Issues        : None                                                     */
83
/*                                                                           */
84
/*  Revision History:                                                        */
85
/*                                                                           */
86
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
87
/*         12 02 2015   Naveen Kumar P  Initial version                      */
88
/*                                                                           */
89
/*****************************************************************************/
90
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
91
                                     WORD32 src_strd,
92
                                     WORD32 alpha,
93
                                     WORD32 beta)
94
8.30M
{
95
8.30M
    __m128i zero = _mm_setzero_si128();
96
8.30M
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
97
8.30M
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
98
8.30M
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
99
8.30M
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
100
8.30M
    __m128i q0_16x8_1;
101
8.30M
    __m128i p0_16x8_1;
102
8.30M
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
103
8.30M
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
104
8.30M
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
105
8.30M
    __m128i Alpha_8x16, Beta_8x16;
106
8.30M
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
107
8.30M
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
108
8.30M
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
109
110
8.30M
    Alpha_8x16 = _mm_set1_epi16(alpha);
111
8.30M
    Beta_8x16 = _mm_set1_epi16(beta);
112
113
8.30M
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
114
8.30M
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
115
8.30M
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
116
8.30M
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
117
8.30M
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
118
8.30M
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
119
8.30M
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
120
8.30M
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
121
122
8.30M
    temp1 = _mm_unpacklo_epi8(line1, line2);
123
8.30M
    temp2 = _mm_unpacklo_epi8(line3, line4);
124
8.30M
    temp3 = _mm_unpacklo_epi8(line5, line6);
125
8.30M
    temp4 = _mm_unpacklo_epi8(line7, line8);
126
127
8.30M
    line1 = _mm_unpacklo_epi16(temp1, temp2);
128
8.30M
    line2 = _mm_unpackhi_epi16(temp1, temp2);
129
8.30M
    line3 = _mm_unpacklo_epi16(temp3, temp4);
130
8.30M
    line4 = _mm_unpackhi_epi16(temp3, temp4);
131
132
8.30M
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
133
8.30M
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
134
8.30M
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
135
8.30M
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
136
137
8.30M
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
138
8.30M
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
139
8.30M
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
140
8.30M
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
141
8.30M
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
142
8.30M
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
143
8.30M
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
144
8.30M
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
145
146
8.30M
    temp1 = _mm_unpacklo_epi8(line1, line2);
147
8.30M
    temp2 = _mm_unpacklo_epi8(line3, line4);
148
8.30M
    temp3 = _mm_unpacklo_epi8(line5, line6);
149
8.30M
    temp4 = _mm_unpacklo_epi8(line7, line8);
150
151
8.30M
    line1 = _mm_unpacklo_epi16(temp1, temp2);
152
8.30M
    line2 = _mm_unpackhi_epi16(temp1, temp2);
153
8.30M
    line3 = _mm_unpacklo_epi16(temp3, temp4);
154
8.30M
    line4 = _mm_unpackhi_epi16(temp3, temp4);
155
156
8.30M
    temp1 = _mm_unpacklo_epi32(line1, line3);
157
8.30M
    temp2 = _mm_unpackhi_epi32(line1, line3);
158
8.30M
    temp3 = _mm_unpacklo_epi32(line2, line4);
159
8.30M
    temp4 = _mm_unpackhi_epi32(line2, line4);
160
161
8.30M
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
162
8.30M
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
163
8.30M
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
164
8.30M
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
165
8.30M
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
166
8.30M
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
167
8.30M
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
168
8.30M
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
169
170
    //Cond1 (ABS(p0 - q0) < alpha)
171
8.30M
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
172
8.30M
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
173
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
174
175
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
176
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
177
178
8.30M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
179
8.30M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
180
181
8.30M
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
182
183
    //Cond2 (ABS(q1 - q0) < beta)
184
8.30M
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
185
8.30M
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
186
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
187
188
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
189
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
190
191
8.30M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
192
8.30M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
193
194
8.30M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
195
196
8.30M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
197
198
    //Cond3 (ABS(p1 - p0) < beta)
199
8.30M
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
200
8.30M
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
201
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
202
203
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
204
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
205
206
8.30M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
207
8.30M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
208
209
8.30M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
210
211
    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
212
8.30M
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
213
214
    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
215
8.30M
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
216
8.30M
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
217
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
218
8.30M
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
219
8.30M
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
220
221
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
222
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
223
8.30M
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
224
8.30M
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
225
226
8.30M
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
227
8.30M
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
228
229
    // (ABS(p2 - p0) < beta)
230
8.30M
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
231
8.30M
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
232
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
233
234
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
235
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
236
8.30M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
237
8.30M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
238
239
8.30M
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
240
8.30M
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
241
242
    // (ABS(q2 - q0) < beta)
243
8.30M
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
244
8.30M
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
245
8.30M
    temp1 = _mm_add_epi8(temp1, temp2);
246
247
8.30M
    temp2 = _mm_unpacklo_epi8(temp1, zero);
248
8.30M
    temp1 = _mm_unpackhi_epi8(temp1, zero);
249
8.30M
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
250
8.30M
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
251
252
8.30M
    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
253
8.30M
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
254
255
    // First 8 pixels
256
8.30M
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
257
8.30M
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
258
8.30M
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
259
8.30M
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
260
8.30M
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
261
8.30M
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
262
8.30M
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
263
8.30M
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
264
265
    // p0_1 and q0_1
266
8.30M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
267
8.30M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
268
8.30M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
269
8.30M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
270
8.30M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
271
8.30M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
272
8.30M
    temp1 = _mm_add_epi16(temp5, temp3);
273
8.30M
    temp2 = _mm_add_epi16(temp6, temp4);
274
8.30M
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
275
8.30M
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
276
277
    // p1_2 and q1_2
278
8.30M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
279
8.30M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
280
8.30M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
281
8.30M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
282
8.30M
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
283
8.30M
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
284
285
    // p0_2 and q0_2
286
8.30M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
287
8.30M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
288
8.30M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
289
8.30M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
290
8.30M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
291
8.30M
    temp3 = _mm_slli_epi16(temp3, 1);
292
8.30M
    temp1 = _mm_add_epi16(temp1, temp3);
293
8.30M
    temp2 = _mm_add_epi16(temp2, temp3);
294
8.30M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
295
8.30M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
296
8.30M
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
297
8.30M
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
298
299
    // p2_2 and q2_2
300
8.30M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
301
8.30M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
302
8.30M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
303
8.30M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
304
8.30M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
305
8.30M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
306
8.30M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
307
8.30M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
308
8.30M
    temp1 = _mm_add_epi16(temp1, temp3);
309
8.30M
    temp2 = _mm_add_epi16(temp2, temp4);
310
8.30M
    temp1 = _mm_add_epi16(temp1, temp5);
311
8.30M
    temp2 = _mm_add_epi16(temp2, temp6);
312
8.30M
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
313
8.30M
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
314
315
    // Second 8 pixels and packing with first 8 pixels
316
8.30M
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
317
8.30M
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
318
8.30M
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
319
8.30M
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
320
8.30M
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
321
8.30M
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
322
8.30M
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
323
8.30M
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
324
325
    // p0_1 and q0_1
326
8.30M
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
327
8.30M
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
328
8.30M
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
329
8.30M
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
330
8.30M
    temp3 = _mm_slli_epi16(p1_8x16, 1);
331
8.30M
    temp4 = _mm_slli_epi16(q1_8x16, 1);
332
8.30M
    temp1 = _mm_add_epi16(temp5, temp3);
333
8.30M
    temp2 = _mm_add_epi16(temp6, temp4);
334
8.30M
    temp1 = _mm_srai_epi16(temp1, 2);
335
8.30M
    temp2 = _mm_srai_epi16(temp2, 2);
336
8.30M
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
337
8.30M
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
338
339
    // p1_2 and q1_2
340
8.30M
    temp6 = _mm_add_epi16(temp6, p0_8x16);
341
8.30M
    temp5 = _mm_add_epi16(temp5, q0_8x16);
342
8.30M
    temp1 = _mm_add_epi16(temp6, p2_8x16);
343
8.30M
    temp2 = _mm_add_epi16(temp5, q2_8x16);
344
8.30M
    temp1 = _mm_srai_epi16(temp1, 2);
345
8.30M
    temp2 = _mm_srai_epi16(temp2, 2);
346
8.30M
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
347
8.30M
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
348
349
    // p0_2 and q0_2
350
8.30M
    temp1 = _mm_add_epi16(temp3, p2_8x16);
351
8.30M
    temp2 = _mm_add_epi16(temp4, q2_8x16);
352
8.30M
    temp1 = _mm_add_epi16(temp1, q1_8x16);
353
8.30M
    temp2 = _mm_add_epi16(temp2, p1_8x16);
354
8.30M
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
355
8.30M
    temp3 = _mm_slli_epi16(temp3, 1);
356
8.30M
    temp1 = _mm_add_epi16(temp1, temp3);
357
8.30M
    temp2 = _mm_add_epi16(temp2, temp3);
358
8.30M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
359
8.30M
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
360
8.30M
    temp1 = _mm_srai_epi16(temp1, 3);
361
8.30M
    temp2 = _mm_srai_epi16(temp2, 3);
362
8.30M
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
363
8.30M
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
364
365
    // p2_2 and q2_2
366
8.30M
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
367
8.30M
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
368
8.30M
    temp3 = _mm_slli_epi16(p2_8x16, 1);
369
8.30M
    temp4 = _mm_slli_epi16(q2_8x16, 1);
370
8.30M
    temp3 = _mm_add_epi16(p2_8x16, temp3);
371
8.30M
    temp4 = _mm_add_epi16(q2_8x16, temp4);
372
8.30M
    temp5 = _mm_slli_epi16(p3_8x16, 1);
373
8.30M
    temp6 = _mm_slli_epi16(q3_8x16, 1);
374
8.30M
    temp1 = _mm_add_epi16(temp1, temp3);
375
8.30M
    temp2 = _mm_add_epi16(temp2, temp4);
376
8.30M
    temp1 = _mm_add_epi16(temp1, temp5);
377
8.30M
    temp2 = _mm_add_epi16(temp2, temp6);
378
8.30M
    temp1 = _mm_srai_epi16(temp1, 3);
379
8.30M
    temp2 = _mm_srai_epi16(temp2, 3);
380
8.30M
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
381
8.30M
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
382
383
    // p0 and q0
384
8.30M
    p0_16x8 = _mm_and_si128(p0_16x8,
385
8.30M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
386
8.30M
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
387
8.30M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
388
8.30M
    q0_16x8 = _mm_and_si128(q0_16x8,
389
8.30M
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
390
8.30M
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
391
8.30M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
392
393
    // p0 and q0
394
8.30M
    p0_16x8 = _mm_and_si128(p0_16x8,
395
8.30M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
396
8.30M
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
397
8.30M
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
398
8.30M
    q0_16x8 = _mm_and_si128(q0_16x8,
399
8.30M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
400
8.30M
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
401
8.30M
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
402
403
    // p1 and q1
404
8.30M
    p1_16x8 = _mm_and_si128(p1_16x8,
405
8.30M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
406
8.30M
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
407
8.30M
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
408
8.30M
    q1_16x8 = _mm_and_si128(q1_16x8,
409
8.30M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
410
8.30M
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
411
8.30M
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
412
413
    // p2 and q2
414
8.30M
    p2_16x8 = _mm_and_si128(p2_16x8,
415
8.30M
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
416
8.30M
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
417
8.30M
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
418
8.30M
    q2_16x8 = _mm_and_si128(q2_16x8,
419
8.30M
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
420
8.30M
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
421
8.30M
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
422
423
8.30M
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
424
8.30M
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
425
8.30M
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
426
8.30M
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
427
428
8.30M
    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
429
8.30M
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
430
8.30M
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
431
8.30M
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
432
433
8.30M
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
434
8.30M
    line2 = _mm_srli_si128(line1, 8);
435
8.30M
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
436
8.30M
    line4 = _mm_srli_si128(line3, 8);
437
8.30M
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
438
8.30M
    line6 = _mm_srli_si128(line5, 8);
439
8.30M
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
440
8.30M
    line8 = _mm_srli_si128(line7, 8);
441
442
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
443
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
444
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
445
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
446
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
447
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
448
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
449
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
450
451
8.30M
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
452
8.30M
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
453
8.30M
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
454
8.30M
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
455
456
8.30M
    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
457
8.30M
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
458
8.30M
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
459
8.30M
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
460
461
8.30M
    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
462
8.30M
    line2 = _mm_srli_si128(line1, 8);
463
8.30M
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
464
8.30M
    line4 = _mm_srli_si128(line3, 8);
465
8.30M
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
466
8.30M
    line6 = _mm_srli_si128(line5, 8);
467
8.30M
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
468
8.30M
    line8 = _mm_srli_si128(line7, 8);
469
470
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
471
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
472
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
473
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
474
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
475
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
476
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
477
8.30M
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
478
479
8.30M
}
480
481
/*****************************************************************************/
482
/*                                                                           */
483
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
484
/*                                                                           */
485
/*  Description   : This function performs filtering of a luma block         */
486
/*                  horizontal edge when the boundary strength is set to 4.  */
487
/*                                                                           */
488
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
489
/*                  src_strd   - source stride                               */
490
/*                  alpha      - alpha value for the boundary                */
491
/*                  beta       - beta value for the boundary                 */
492
/*                                                                           */
493
/*  Globals       : None                                                     */
494
/*                                                                           */
495
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
496
/*                  title "Filtering process for edges for bS equal to 4" in */
497
/*                  ITU T Rec H.264.                                         */
498
/*                                                                           */
499
/*  Outputs       : None                                                     */
500
/*                                                                           */
501
/*  Returns       : None                                                     */
502
/*                                                                           */
503
/*  Issues        : None                                                     */
504
/*                                                                           */
505
/*  Revision History:                                                        */
506
/*                                                                           */
507
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
508
/*         12 02 2015   Naveen Kumar P  Initial version                      */
509
/*                                                                           */
510
/*****************************************************************************/
511
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    /* Strong (bS==4) deblocking of one horizontal luma edge, 16 pixels wide.
     * pu1_src points at the q0 row; the four p rows live above it.
     * All 16 columns are filtered in parallel: bytes are widened to 16-bit
     * lanes (low/high halves processed separately), filtered, then packed
     * back and blended with the unfiltered pixels via byte masks.
     */
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);

    /* Base pointer for the p side: four rows above the edge. */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    /* Row offsets: q offsets are relative to pu1_src, p offsets relative to
     * pu1_HorzPixel (so p0 is the row just above q0, i.e. +3*src_strd). */
    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posQ3 = X3(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;
    i16_posP3 = 0;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load the eight rows around the edge (16 pixels each). */
    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |a-b| on unsigned bytes: subs_epu8 in both directions, then add —
     * one of the two saturates to 0, the other carries the magnitude. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    /* Widen to 16-bit so the signed compare against alpha is valid. */
    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    /* packs turns 0x0000/0xFFFF words back into per-pixel byte masks. */
    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
    /* flag1 = filter-this-pixel mask (all three basic conditions hold). */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
    /* Strong-filter eligibility threshold; Alpha_8x16 is repurposed as
     * (alpha >> 2) + 2 from here on. */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta)
    /* flag3 = use the 3-tap strong filter on the p side. */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    // (ABS(q2 - q0) < beta)
    /* flag4 = use the 3-tap strong filter on the q side. */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    // First 8 pixels
    /* Widen the low 8 columns to 16-bit lanes for the filter arithmetic. */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    // p0_1 and q0_1
    /* Weak variant: p0' = (2*p1 + p0 + q1 + 2) >> 2 and the q-mirrored
     * q0' = (2*q1 + q0 + p1 + 2) >> 2 (used when the strong-filter
     * condition fails). temp5/temp6 are kept for reuse below. */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    // p1_2 and q1_2
    /* Strong variant: p1' = (p2 + p1 + p0 + q0 + 2) >> 2 (mirrored for q1'). */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    // p0_2 and q0_2
    /* Strong variant: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
     * (mirrored for q0'). */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    // p2_2 and q2_2
    /* Strong variant: p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
     * (mirrored for q2'). temp6/temp5 still hold p1+p0+q0+2 / q1+q0+p0+2. */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    // Second 8 pixels and packing with first 8 pixels
    /* Repeat the identical arithmetic on the high 8 columns; results are
     * packed (packus) onto the low-half results computed above. */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    // p0_1 and q0_1
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    // p1_2 and q1_2
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    // p0_2 and q0_2
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    // p2_2 and q2_2
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    // p0 and q0
    /* Blend step 1: where flag1 is set, take the weak-filtered p0'/q0';
     * elsewhere keep the original pixel. (and with ~mask) + (and with mask). */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    // p0 and q0
    /* Blend step 2: where the strong-filter masks (flag3/flag4) are set,
     * overwrite with the strong-filtered p0'/q0'. flag3/flag4 imply flag1,
     * so this correctly overrides the weak result. */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    // p1 and q1
    /* p1/q1 are only modified by the strong filter. */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    // p2 and q2
    /* Likewise p2/q2 only change under the strong filter. */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Write back the six modified rows (p3 and q3 are read-only inputs). */
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);

    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);

}
817
818
/*****************************************************************************/
819
/*                                                                           */
820
/*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
821
/*                                                                           */
822
/*  Description   : This function performs filtering of a luma block         */
823
/*                  vertical edge when the boundary strength is less than 4. */
824
/*                                                                           */
825
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
826
/*                  src_strd      - source stride                            */
827
/*                  alpha         - alpha value for the boundary             */
828
/*                  beta          - beta value for the boundary              */
829
/*                  u4_bs         - packed Boundary strength array           */
830
/*                  pu1_cliptab   - tc0_table                                */
831
/*                                                                           */
832
/*  Globals       : None                                                     */
833
/*                                                                           */
834
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
835
/*                  title "Filtering process for edges for bS less than 4"   */
836
/*                  in ITU T Rec H.264.                                      */
837
/*                                                                           */
838
/*  Outputs       : None                                                     */
839
/*                                                                           */
840
/*  Returns       : None                                                     */
841
/*                                                                           */
842
/*  Issues        : None                                                     */
843
/*                                                                           */
844
/*  Revision History:                                                        */
845
/*                                                                           */
846
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
847
/*         12 02 2015   Naveen Kumar P  Initial version                      */
848
/*                                                                           */
849
/*****************************************************************************/
850
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    /* Normal (bS < 4) deblocking of one vertical luma edge, 16 rows tall.
     * Processed as two 8-row halves: each half is transposed so the edge
     * columns become SIMD rows, filtered in 16-bit lanes, transposed back
     * and stored. u4_bs packs four per-4-row boundary strengths (MSB
     * first); pu1_cliptab is the tc0 clip table indexed by bS.
     */
    UWORD8 u1_Bs, u1_Bs1;

    WORD32 j = 0;

    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i int1, int2, int3, int4, high1, high2;
    __m128i flag, flag1, i_C, i_C0;
    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
                    temp1;
    __m128i zero = _mm_setzero_si128();

    /* Two iterations: rows 0-7, then rows 8-15. */
    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
    {
        //Transpose
        /* Load 8 bytes per row starting 3 pixels left of the edge
         * (p2 p1 p0 q0 q1 q2 ...), widen to 16-bit, and transpose 8x8 so
         * that after this block linea..linef hold the p2,p1,p0,q0,q1,q2
         * columns (8 rows per register); lineg/lineh carry the remaining
         * unfiltered columns through. */
        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));

        linea = _mm_unpacklo_epi8(linea, zero);
        lineb = _mm_unpacklo_epi8(lineb, zero);
        linec = _mm_unpacklo_epi8(linec, zero);
        lined = _mm_unpacklo_epi8(lined, zero);

        int1 = _mm_unpacklo_epi16(linea, lineb);
        lineb = _mm_unpackhi_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(linec, lined);
        lined = _mm_unpackhi_epi16(linec, lined);

        linea = _mm_unpacklo_epi16(int1, int2);
        int1 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(lineb, lined);
        high1 = _mm_unpackhi_epi16(lineb, lined);

        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));

        linee = _mm_unpacklo_epi8(linee, zero);
        linef = _mm_unpacklo_epi8(linef, zero);
        lineg = _mm_unpacklo_epi8(lineg, zero);
        lineh = _mm_unpacklo_epi8(lineh, zero);

        int2 = _mm_unpacklo_epi16(linee, linef);
        linef = _mm_unpackhi_epi16(linee, linef);

        int3 = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = _mm_unpacklo_epi16(int2, int3);
        int2 = _mm_unpackhi_epi16(int2, int3);

        lineg = _mm_unpacklo_epi16(linef, lineh);
        high2 = _mm_unpackhi_epi16(linef, lineh);

        int4 = _mm_unpacklo_epi16(linea, linee);
        lineb = _mm_unpackhi_epi16(linea, linee);

        int3 = _mm_unpacklo_epi16(int1, int2);
        lined = _mm_unpackhi_epi16(int1, int2);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        linef = _mm_unpackhi_epi16(linec, lineg);

        linea = int4;
        linec = int3;
        linee = int2;

        lineg = _mm_unpacklo_epi16(high1, high2);
        lineh = _mm_unpackhi_epi16(high1, high2);

        //end of transpose

        /* Consume the two top bS values for this 8-row half (one per
         * 4-row group), then shift so the next iteration sees the rest. */
        u1_Bs = (u4_bs >> 24) & 0xff;
        u1_Bs1 = (u4_bs >> 16) & 0xff;
        u4_bs <<= 16;

        /* flag1: per-lane mask, all-ones where bS != 0 (rows 0-3 use u1_Bs
         * lanes, rows 4-7 use u1_Bs1 lanes — note set_epi16 is MSB-first). */
        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
                              u1_Bs1, u1_Bs);
        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask

        /* i_C0 = tc0 clip value per lane, from the table indexed by bS. */
        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);

        /* Filter-enable conditions (linec=p0, lined=q0, lineb=p1, linee=q1). */
        diff = _mm_subs_epi16(linec, lined); //Condn 1
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(alpha);
        flag = _mm_cmpgt_epi16(const1, diff);          /* |p0-q0| < alpha */

        diff = _mm_subs_epi16(linee, lined); //Condtn 2
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(beta);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); /* |q1-q0| < beta */

        diff = _mm_subs_epi16(lineb, linec); //Condtn 3
        diff = _mm_abs_epi16(diff);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on

        flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)

        //Adding Ap<Beta and Aq<Beta
        /* tC = tc0 + (Ap < beta) + (Aq < beta): each true compare (0xFFFF)
         * is turned into +1 via 0 - (-1). linea=p2, linef=q2. */
        i_Ap = _mm_subs_epi16(linea, linec);
        i_Ap = _mm_abs_epi16(i_Ap);
        const2 = _mm_cmpgt_epi16(const1, i_Ap);
        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
        i_C = _mm_add_epi16(i_C0, const2);

        i_Aq = _mm_subs_epi16(linef, lined);
        i_Aq = _mm_abs_epi16(i_Aq);
        const2 = _mm_cmpgt_epi16(const1, i_Aq);
        const2 = _mm_subs_epi16(zero, const2);
        i_C = _mm_add_epi16(i_C, const2);

        //Calculate in_macro
        /* delta = clip(((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, -tC, tC) */
        diff = _mm_subs_epi16(lined, linec);
        diff = _mm_slli_epi16(diff, 2);
        const2 = _mm_subs_epi16(lineb, linee);
        diff = _mm_add_epi16(diff, const2);
        const2 = _mm_set1_epi16(4);
        diff = _mm_add_epi16(diff, const2);
        in_macro = _mm_srai_epi16(diff, 3);

        in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
        i_C = _mm_subs_epi16(zero, i_C);
        in_macro = _mm_max_epi16(i_C, in_macro);

        //Compute and store
        /* p0' = p0 + delta, applied only on lanes where flag is set. */
        in_macrotemp = _mm_add_epi16(linec, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp = _mm_and_si128(linec,
                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp = _mm_add_epi16(temp, in_macrotemp);
        //temp= _mm_packus_epi16 (temp, zero);
        //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);

        /* q0' = q0 - delta, same masked blend. */
        in_macrotemp = _mm_subs_epi16(lined, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp1 = _mm_and_si128(lined,
                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp1 = _mm_add_epi16(temp1, in_macrotemp);
        //temp1= _mm_packus_epi16 (temp1, zero);
        //_mm_storel_epi64(pu1_src+i, in_macrotemp);

        //If Ap<Beta
        /* p1' = p1 + clip((p2 + (p0+q0+1)>>1 - 2*p1) >> 1, -tc0, tc0),
         * only where Ap < beta (and the main flag holds). */
        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
        flag1 = _mm_and_si128(flag, flag1);
        in_macrotemp = _mm_add_epi16(linec, lined);
        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
        in_macro = _mm_add_epi16(in_macrotemp, linea);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0);       /* i_C0 now holds -tc0 */
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        lineb = _mm_add_epi16(lineb, in_macro);
        //in_macro= _mm_packus_epi16 (i_p1, zero);
        //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);

        /* Mirrored q1 update where Aq < beta; in_macrotemp still holds
         * (p0+q0+1)>>1 from above. */
        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
        flag1 = _mm_and_si128(flag, flag1);
        in_macro = _mm_add_epi16(in_macrotemp, linef);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        /* i_C0 was negated above; abs restores +tc0 for this second clip. */
        i_C0 = _mm_abs_epi16(i_C0);
        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0);
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        linee = _mm_add_epi16(linee, in_macro);
        //in_macro= _mm_packus_epi16 (i_q1, zero);
        //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
        linec = temp;   /* commit p0' */
        lined = temp1;  /* commit q0' */
        //End of filtering

        /* Inverse 8x8 transpose back to row order. */
        int1 = _mm_unpacklo_epi16(linea, linee);
        linee = _mm_unpackhi_epi16(linea, linee);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        lineg = _mm_unpackhi_epi16(linec, lineg);

        linea = _mm_unpacklo_epi16(int1, int2);
        int3 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(linee, lineg);
        lineg = _mm_unpackhi_epi16(linee, lineg);

        int1 = _mm_unpacklo_epi16(lineb, linef);
        linef = _mm_unpackhi_epi16(lineb, linef);

        int2 = _mm_unpacklo_epi16(lined, lineh);
        lineh = _mm_unpackhi_epi16(lined, lineh);

        lineb = _mm_unpacklo_epi16(int1, int2);
        int4 = _mm_unpackhi_epi16(int1, int2);

        lined = _mm_unpacklo_epi16(linef, lineh);
        lineh = _mm_unpackhi_epi16(linef, lineh);

        int1 = _mm_unpackhi_epi16(linea, lineb);
        linea = _mm_unpacklo_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(int3, int4);
        high1 = _mm_unpackhi_epi16(int3, int4);

        lineb = _mm_unpacklo_epi16(linec, lined);
        linef = _mm_unpackhi_epi16(linec, lined);

        lined = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = int1;
        lineg = high1;
        linec = int2;
        //End of inverse transpose

        //Packs and stores
        /* Pack each row back to 8 unsigned bytes and write it in place. */
        linea = _mm_packus_epi16(linea, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);

        lineb = _mm_packus_epi16(lineb, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);

        linec = _mm_packus_epi16(linec, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);

        lined = _mm_packus_epi16(lined, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);

        linee = _mm_packus_epi16(linee, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);

        linef = _mm_packus_epi16(linef, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);

        lineg = _mm_packus_epi16(lineg, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);

        lineh = _mm_packus_epi16(lineh, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);

    }
}
1112
1113
/*****************************************************************************/
1114
/*                                                                           */
1115
/*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
1116
/*                                                                           */
1117
/*  Description   : This function performs filtering of a luma block         */
1118
/*                  horizontal edge when boundary strength is less than 4.   */
1119
/*                                                                           */
1120
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1121
/*                  src_strd      - source stride                            */
1122
/*                  alpha         - alpha value for the boundary             */
1123
/*                  beta          - beta value for the boundary              */
1124
/*                  u4_bs         - packed Boundary strength array           */
1125
/*                  pu1_cliptab   - tc0_table                                */
1126
/*                                                                           */
1127
/*  Globals       : None                                                     */
1128
/*                                                                           */
1129
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
1130
/*                  title "Filtering process for edges for bS less than 4"   */
1131
/*                  in ITU T Rec H.264.                                      */
1132
/*                                                                           */
1133
/*  Outputs       : None                                                     */
1134
/*                                                                           */
1135
/*  Returns       : None                                                     */
1136
/*                                                                           */
1137
/*  Issues        : None                                                     */
1138
/*                                                                           */
1139
/*  Revision History:                                                        */
1140
/*                                                                           */
1141
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1142
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1143
/*                                                                           */
1144
/*****************************************************************************/
1145
/* Filters one horizontal luma edge (16 pixels wide) for boundary strength  */
/* bS < 4, per the normal filter of H.264 Sec. 8.7.2.3. pu1_src points at   */
/* row q0; rows p3..p0 lie above it and q1..q3 below, src_strd apart.       */
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
    __m128i temp1, temp2;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8, in_macro_hi_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;

    /* pu1_HorzPixel points 4 rows above q0 (i.e. at row p3). */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    /* Q-row offsets are relative to pu1_src; P-row offsets to pu1_HorzPixel. */
    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;

    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));

    /* Unpack the four per-4-pixel boundary strengths; the MSB byte of u4_bs
       (u1_Bs0) applies to the first (lowest-address) group of 4 pixels. */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    /* tc0 clipping thresholds looked up per boundary strength. */
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Replicate each bS value across the 4 pixel bytes it governs. */
    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);

    /* Replicate each tc0 value the same way. */
    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
                           clip0, clip0);

    /* Byte mask: 0xFF where bS != 0 (edge may be filtered), 0 elsewhere. */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    /* Widen tc0 to 16-bit lanes for the low and high 8-pixel halves. */
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);

    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |p0 - q0| computed as the sum of the two saturating unsigned diffs. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
    /* Only filter pixels whose boundary strength is nonzero. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
    /* flag1 now selects the pixels where all three edge conditions hold. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag2 = (Ap < beta) & flag1: these pixels also get p1 filtered. */
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* C = C0 + 1 where Ap < beta: compare masks are -1, so 0 - mask = +1. */
    temp2 = _mm_subs_epi16(zero, temp2);
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);

    // (ABS(q2 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag3 = (Aq < beta) & flag1: these pixels also get q1 filtered. */
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);

    /* C += 1 where Aq < beta; C = C0 + (Ap<beta) + (Aq<beta) is complete. */
    temp2 = _mm_subs_epi16(zero, temp2);
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, low 8 pixels. */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    /* Same delta for the high 8 pixels. */
    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
                           _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
                           _mm_unpackhi_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);

    /* Clamp delta to [-C, C] (min against C, then max against -C). */
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* p0' = p0 + delta; blend with the original p0 under flag1 and store. */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(p0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);

    /* q0' = q0 - delta, blended and stored the same way. */
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(q0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_src), temp1);

    //if(Ap < Beta)
    /* p1 delta = (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1, low half;
       _mm_avg_epu16 supplies the rounded average (p0 + q0 + 1) >> 1. */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    //temp2 = _mm_subs_epi16(zero,temp2);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    /* High half of the p1 delta. */
    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
    //temp2 = _mm_subs_epi16(zero,temp2);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    /* Clamp the p1 delta to [-C0, C0]; note C0_8x16/C0_hi_8x16 are left
       holding -C0 after this sequence. */
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* p1' = p1 + delta, blended under flag2 (Ap < beta) and stored. */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag2_16x8);
    temp2 = _mm_and_si128(p1_16x8,
                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);

    //if(Aq < Beta)
    /* q1 delta = (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1, low half. */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    //temp2 = _mm_slli_epi16 (temp2, 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    /* High half of the q1 delta. */
    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
    //temp2 = _mm_slli_epi16 (temp2, 1);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    /* C0 regs currently hold -C0 (negated above), so the clamp order is
       reversed here: max against -C0 first, re-negate, then min. */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* q1' = q1 + delta, blended under flag3 (Aq < beta) and stored. */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag3_16x8);
    temp2 = _mm_and_si128(q1_16x8,
                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);

}
1411
1412
/*****************************************************************************/
1413
/*                                                                           */
1414
/*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
1415
/*                                                                           */
1416
/*  Description   : This function performs filtering of a luma block         */
1417
/*                  vertical edge when boundary strength is set to 4.        */
1418
/*                                                                           */
1419
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1420
/*                  src_strd      - source stride                            */
1421
/*                  alpha         - alpha value for the boundary             */
1422
/*                  beta          - beta value for the boundary              */
1423
/*                                                                           */
1424
/*  Globals       : None                                                     */
1425
/*                                                                           */
1426
/*  Processing    : When the function is called twice, this operation is as  */
1427
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1428
/*                  process for edges for bS equal to 4" in ITU T Rec H.264. */
1429
/*                                                                           */
1430
/*  Outputs       : None                                                     */
1431
/*                                                                           */
1432
/*  Returns       : None                                                     */
1433
/*                                                                           */
1434
/*  Issues        : None                                                     */
1435
/*                                                                           */
1436
/*  Revision History:                                                        */
1437
/*                                                                           */
1438
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1439
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1440
/*                                                                           */
1441
/*****************************************************************************/
1442
/* Filters one vertical luma edge of an MBAFF macroblock (8 rows) with the  */
/* strong bS = 4 filter of H.264 Sec. 8.7.2.3. pu1_src points at column q0; */
/* columns p3..p0 lie to its left, q1..q3 to its right. The 8x8 neighbor-   */
/* hood is transposed into per-column registers, filtered, and transposed   */
/* back before storing.                                                     */
void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                           WORD32 src_strd,
                                           WORD32 alpha,
                                           WORD32 beta)
{
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load 8 rows of 8 bytes each, starting 4 pixels left of the edge. */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose, stage 1: interleave adjacent row pairs. */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    /* Transpose stage 2: interleave 16-bit pairs. */
    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    /* Transpose stage 3: each register now holds two pixel columns. */
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* Split into one column per register (low 8 bytes; upper half zero). */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |p0 - q0| via the two saturating unsigned differences. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
    /* flag1 selects pixels where the basic filter condition holds. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
    /* Strong-filter gate; Alpha_8x16 is reused as (alpha >> 2) + 2. */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag3: strong 3-tap filtering applies on the P side. */
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    // (ABS(q2 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag4: strong 3-tap filtering applies on the Q side. */
    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    // First 8 pixels
    /* Widen all eight columns to 16-bit lanes for the filter arithmetic. */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    // p0_1 and q0_1
    /* Weak variants: p0' = (2*p1 + p0 + q1 + 2) >> 2 and the mirror for q0'. */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    // p1_2 and q1_2
    /* Strong variants: p1' = (p2 + p1 + p0 + q0 + 2) >> 2 and mirror. */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    // p0_2 and q0_2
    /* Strong variants: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, mirror. */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    // p2_2 and q2_2
    /* Strong variants: p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, mirror. */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    // p0_1 and q0_1
    /* Narrow every filtered result back to unsigned bytes. */
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);

    // p1_2 and q1_2
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);

    // p0_2 and q0_2
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);

    // p2_2 and q2_2
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);

    // p0 and q0
    /* Blend in the weak p0'/q0' wherever the basic condition (flag1) holds. */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    // p0 and q0
    /* Override with the strong p0'/q0' where flag3/flag4 hold. */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    // p1 and q1
    /* p1/q1 change only under the strong-filter masks. */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    // p2 and q2
    /* Likewise p2/q2 change only under the strong-filter masks. */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Transpose the eight filtered columns back into eight rows. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    /* Store the 8 filtered rows back over the loaded region. */
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

}
1728
1729
/*****************************************************************************/
1730
/*                                                                           */
1731
/*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
1732
/*                                                                           */
1733
/*  Description   : This function performs filtering of a luma block         */
1734
/*                  vertical edge when boundary strength is less than 4.     */
1735
/*                                                                           */
1736
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1737
/*                  src_strd      - source stride                            */
1738
/*                  alpha         - alpha value for the boundary             */
1739
/*                  beta          - beta value for the boundary              */
1740
/*                  u4_bs         - packed Boundary strength array           */
1741
/*                  pu1_cliptab   - tc0_table                                */
1742
/*                                                                           */
1743
/*  Globals       : None                                                     */
1744
/*                                                                           */
1745
/*  Processing    : When the function is called twice, this operation is as  */
1746
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1747
/*                  process for edges for bS less than 4" in ITU T Rec H.264.*/
1748
/*                                                                           */
1749
/*  Outputs       : None                                                     */
1750
/*                                                                           */
1751
/*  Returns       : None                                                     */
1752
/*                                                                           */
1753
/*  Issues        : None                                                     */
1754
/*                                                                           */
1755
/*  Revision History:                                                        */
1756
/*                                                                           */
1757
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1758
/*         12 02 2015   Naveen Kumar P  Initial version                      */
1759
/*                                                                           */
1760
/*****************************************************************************/
1761
void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha,
                                             WORD32 beta,
                                             UWORD32 u4_bs,
                                             const UWORD8 *pu1_cliptab)
{
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i temp1, temp2, temp3, temp4;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;

    /* Load 8 rows of 8 bytes straddling the vertical edge at pu1_src:
     * each row holds pixels p3 p2 p1 p0 | q0 q1 q2 q3 (edge between p0 and q0). */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose (three unpack stages) so that each pixel column
     * p3..q3 ends up contiguous in its own register, one byte per row. */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Only the low 8 bytes of each register are meaningful (8 rows filtered). */
    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);

    /* Unpack the four per-2-row boundary strengths (MSB first) and look up
     * the corresponding tc0 clip values from the table. */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Each bS / clip value covers 2 of the 8 rows, hence the pairwise splat. */
    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);

    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
                           clip1, clip1, clip0, clip0);

    /* Byte mask: 0xFF where bS != 0 (rows that may be filtered at all). */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero); /* widen C0 to 16-bit lanes */

    //Cond1 (ABS(p0 - q0) < alpha)
    /* abs diff via two unsigned saturating subtractions: one term is always 0 */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);

    flag1_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);

    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
    /* flag1 = master filter mask: bS != 0 AND all three edge conditions hold */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta)  -> Ap condition; enables the p1 filter
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* temp2 lanes are 0 or -1; negating gives 0/+1, so C = C0 + (Ap < beta) */
    temp2 = _mm_subs_epi16(zero, temp2);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);

    // (ABS(q2 - q0) < beta)  -> Aq condition; enables the q1 filter
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag3_16x8 = _mm_packs_epi16(temp2, zero);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);

    /* C = C0 + (Ap < beta) + (Aq < beta), the tc of Sec. 8.7.2.3 */
    temp2 = _mm_subs_epi16(zero, temp2);

    C_8x16 = _mm_add_epi16(C_8x16, temp2);

    /* delta = clip3(-C, C, ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3) */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3

    // p0' = p0 + delta, then blend with the unfiltered p0 under flag1
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    p0_16x8_2 = _mm_and_si128(
                    p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);

    // q0' = q0 - delta, then blend with the unfiltered q0 under flag1
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    q0_16x8_2 = _mm_and_si128(
                    q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);

    //if(Ap < Beta)  -> p1' = p1 + clip3(-C0, C0, (p2 + (p0+q0+1)>>1 - 2*p1) >> 1)
    /* avg_epu16 gives (p0 + q0 + 1) >> 1 exactly */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    //temp2 = _mm_subs_epi16(zero,temp2);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16); /* C0_8x16 now holds -C0 */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3

    // p1 blend under flag2 (Ap condition)
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);

    //if(Aq < Beta)  -> q1' = q1 + clip3(-C0, C0, (q2 + (p0+q0+1)>>1 - 2*q1) >> 1)
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    //temp2 = _mm_slli_epi16 (temp2, 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    /* NOTE: C0_8x16 still holds -C0 from the p1 clip above, so the clip
     * order is reversed here: max with -C0 first, negate back, min with +C0. */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3

    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);

    // q1 blend under flag3 (Aq condition)
    temp1 = _mm_packus_epi16(temp1, zero);

    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);

    /* Transpose the 8 column registers (p3 p2 p1' p0' q0' q1' q2 q3)
     * back into 8 pixel rows. p3/p2/q2/q3 are written back unchanged. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    line7 = _mm_unpacklo_epi16(temp1, temp2);
    temp1 = _mm_unpackhi_epi16(temp1, temp2);
    line8 = _mm_unpacklo_epi16(temp3, temp4);
    temp2 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(line7, line8);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(line7, line8);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(temp1, temp2);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(temp1, temp2);
    line8 = _mm_srli_si128(line7, 8);

    /* Store the 8 filtered rows back over the original pixels. */
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
}
2012