Coverage Report

Created: 2025-08-03 06:09

/src/libavc/common/x86/ih264_deblk_chroma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27
/*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28
/*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29
/*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30
/*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31
/*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a chroma block       */
65
/*                  vertical edge when the boundary strength is set to 4 in  */
66
/*                  high profile.                                            */
67
/*                                                                           */
68
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69
/*                  src_strd   - source stride                               */
70
/*                  alpha_cb   - alpha value for the boundary in U           */
71
/*                  beta_cb    - beta value for the boundary in U            */
72
/*                  alpha_cr   - alpha value for the boundary in V           */
73
/*                  beta_cr    - beta value for the boundary in V            */
74
/*                                                                           */
75
/*  Globals       : None                                                     */
76
/*                                                                           */
77
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78
/*                  title "Filtering process for edges for bS equal to 4" in */
79
/*                  ITU T Rec H.264 with alpha and beta values different in  */
80
/*                  U and V.                                                 */
81
/*                                                                           */
82
/*  Outputs       : None                                                     */
83
/*                                                                           */
84
/*  Returns       : None                                                     */
85
/*                                                                           */
86
/*  Issues        : None                                                     */
87
/*                                                                           */
88
/*  Revision History:                                                        */
89
/*                                                                           */
90
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91
/*         12 02 2015   Naveen Kumar P  Initial version                      */
92
/*                                                                           */
93
/*****************************************************************************/
94
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95
                                       WORD32 src_strd,
96
                                       WORD32 alpha_cb,
97
                                       WORD32 beta_cb,
98
                                       WORD32 alpha_cr,
99
                                       WORD32 beta_cr)
100
8.81M
{
101
8.81M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102
8.81M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103
8.81M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104
8.81M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105
8.81M
    __m128i temp1, temp2, temp3, temp4;
106
107
8.81M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108
8.81M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109
8.81M
    __m128i flag1, flag2;
110
8.81M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111
8.81M
    __m128i zero = _mm_setzero_si128();
112
8.81M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114
    /* Load and transpose the pixel values */
115
8.81M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116
8.81M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117
8.81M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118
8.81M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119
8.81M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120
8.81M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121
8.81M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122
8.81M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124
8.81M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
125
8.81M
    temp2 = _mm_unpacklo_epi16(linec, lined);
126
8.81M
    temp3 = _mm_unpacklo_epi16(linee, linef);
127
8.81M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129
8.81M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130
8.81M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131
8.81M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132
8.81M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134
8.81M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135
8.81M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136
8.81M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137
8.81M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138
    /* End of transpose */
139
140
8.81M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141
8.81M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142
8.81M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143
8.81M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145
8.81M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146
8.81M
    diff = _mm_abs_epi16(diff);
147
8.81M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148
8.81M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150
8.81M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151
8.81M
    diff = _mm_abs_epi16(diff);
152
8.81M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153
8.81M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155
8.81M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156
8.81M
    diff = _mm_abs_epi16(diff);
157
8.81M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159
8.81M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160
8.81M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161
8.81M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162
8.81M
    temp1 = _mm_add_epi16(temp1, temp2);
163
8.81M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165
8.81M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166
8.81M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167
8.81M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168
8.81M
    temp1 = _mm_add_epi16(temp1, temp2);
169
8.81M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171
8.81M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172
8.81M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173
8.81M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174
8.81M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176
8.81M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177
8.81M
    diff = _mm_abs_epi16(diff);
178
8.81M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179
8.81M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181
8.81M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182
8.81M
    diff = _mm_abs_epi16(diff);
183
8.81M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184
8.81M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186
8.81M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187
8.81M
    diff = _mm_abs_epi16(diff);
188
8.81M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190
8.81M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191
8.81M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192
8.81M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193
8.81M
    temp1 = _mm_add_epi16(temp1, temp2);
194
8.81M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196
8.81M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197
8.81M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198
8.81M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199
8.81M
    temp1 = _mm_add_epi16(temp1, temp2);
200
8.81M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202
8.81M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203
8.81M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205
8.81M
    flag1 = _mm_packs_epi16(flag1, flag2);
206
207
8.81M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208
8.81M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209
8.81M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210
8.81M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212
8.81M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213
8.81M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214
8.81M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215
8.81M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217
    /* Inverse-transpose and store back */
218
8.81M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219
8.81M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220
8.81M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221
8.81M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223
8.81M
    linea = _mm_unpacklo_epi32(temp1, temp3);
224
8.81M
    lineb = _mm_srli_si128(linea, 8);
225
8.81M
    linec = _mm_unpackhi_epi32(temp1, temp3);
226
8.81M
    lined = _mm_srli_si128(linec, 8);
227
8.81M
    linee = _mm_unpacklo_epi32(temp2, temp4);
228
8.81M
    linef = _mm_srli_si128(linee, 8);
229
8.81M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
230
8.81M
    lineh = _mm_srli_si128(lineg, 8);
231
232
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239
8.81M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241
8.81M
}
242
243
/*****************************************************************************/
244
/*                                                                           */
245
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246
/*                                                                           */
247
/*  Description   : This function performs filtering of a chroma block       */
248
/*                  horizontal edge when the boundary strength is set to 4   */
249
/*                  in high profile.                                         */
250
/*                                                                           */
251
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252
/*                  src_strd   - source stride                               */
253
/*                  alpha_cb   - alpha value for the boundary in U           */
254
/*                  beta_cb    - beta value for the boundary in U            */
255
/*                  alpha_cr   - alpha value for the boundary in V           */
256
/*                  beta_cr    - beta value for the boundary in V            */
257
/*                                                                           */
258
/*  Globals       : None                                                     */
259
/*                                                                           */
260
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261
/*                  title "Filtering process for edges for bS equal to 4" in */
262
/*                  ITU T Rec H.264 with alpha and beta values different in  */
263
/*                  U and V.                                                 */
264
/*                                                                           */
265
/*  Outputs       : None                                                     */
266
/*                                                                           */
267
/*  Returns       : None                                                     */
268
/*                                                                           */
269
/*  Issues        : None                                                     */
270
/*                                                                           */
271
/*  Revision History:                                                        */
272
/*                                                                           */
273
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274
/*         12 02 2015   Naveen Kumar P  Initial version                      */
275
/*                                                                           */
276
/*****************************************************************************/
277
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278
                                       WORD32 src_strd,
279
                                       WORD32 alpha_cb,
280
                                       WORD32 beta_cb,
281
                                       WORD32 alpha_cr,
282
                                       WORD32 beta_cr)
283
8.73M
{
284
8.73M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285
8.73M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287
8.73M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288
8.73M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289
8.73M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290
8.73M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291
8.73M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292
8.73M
    __m128i flag1, flag2;
293
8.73M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294
8.73M
    __m128i zero = _mm_setzero_si128();
295
8.73M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296
8.73M
    __m128i temp1, temp2;
297
298
8.73M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300
8.73M
    i16_posQ1 = src_strd;
301
8.73M
    i16_posP0 = src_strd;
302
8.73M
    i16_posP1 = 0;
303
304
8.73M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305
8.73M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306
8.73M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307
8.73M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309
8.73M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310
8.73M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311
8.73M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312
8.73M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314
8.73M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315
8.73M
    diff = _mm_abs_epi16(diff);
316
8.73M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317
8.73M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319
8.73M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320
8.73M
    diff = _mm_abs_epi16(diff);
321
8.73M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322
8.73M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324
8.73M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325
8.73M
    diff = _mm_abs_epi16(diff);
326
8.73M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328
8.73M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329
8.73M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330
8.73M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331
8.73M
    temp1 = _mm_add_epi16(temp1, temp2);
332
8.73M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334
8.73M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335
8.73M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336
8.73M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337
8.73M
    temp1 = _mm_add_epi16(temp1, temp2);
338
8.73M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340
8.73M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341
8.73M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342
8.73M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343
8.73M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345
8.73M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346
8.73M
    diff = _mm_abs_epi16(diff);
347
8.73M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348
8.73M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350
8.73M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351
8.73M
    diff = _mm_abs_epi16(diff);
352
8.73M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353
8.73M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355
8.73M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356
8.73M
    diff = _mm_abs_epi16(diff);
357
8.73M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359
8.73M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360
8.73M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361
8.73M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362
8.73M
    temp1 = _mm_add_epi16(temp1, temp2);
363
8.73M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365
8.73M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366
8.73M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367
8.73M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368
8.73M
    temp1 = _mm_add_epi16(temp1, temp2);
369
8.73M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371
8.73M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372
8.73M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374
8.73M
    flag1 = _mm_packs_epi16(flag1, flag2);
375
376
8.73M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377
8.73M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378
8.73M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379
8.73M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380
8.73M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382
8.73M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383
8.73M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384
8.73M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385
8.73M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386
8.73M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388
8.73M
}
389
390
/*****************************************************************************/
391
/*                                                                           */
392
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393
/*                                                                           */
394
/*  Description   : This function performs filtering of a chroma block       */
395
/*                  vertical edge when the boundary strength is less than 4  */
396
/*                  in high profile.                                         */
397
/*                                                                           */
398
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399
/*                  src_strd         - source stride                         */
400
/*                  alpha_cb         - alpha value for the boundary in U     */
401
/*                  beta_cb          - beta value for the boundary in U      */
402
/*                  alpha_cr         - alpha value for the boundary in V     */
403
/*                  beta_cr          - beta value for the boundary in V      */
404
/*                  u4_bs            - packed Boundary strength array        */
405
/*                  pu1_cliptab_cb   - tc0_table for U                       */
406
/*                  pu1_cliptab_cr   - tc0_table for V                       */
407
/*                                                                           */
408
/*  Globals       : None                                                     */
409
/*                                                                           */
410
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411
/*                  title "Filtering process for edges for bS less than 4"   */
412
/*                  in ITU T Rec H.264 with alpha and beta values different  */
413
/*                  in U and V.                                              */
414
/*                                                                           */
415
/*  Outputs       : None                                                     */
416
/*                                                                           */
417
/*  Returns       : None                                                     */
418
/*                                                                           */
419
/*  Issues        : None                                                     */
420
/*                                                                           */
421
/*  Revision History:                                                        */
422
/*                                                                           */
423
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424
/*         12 02 2015   Naveen Kumar P  Initial version                      */
425
/*                                                                           */
426
/*****************************************************************************/
427
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428
                                         WORD32 src_strd,
429
                                         WORD32 alpha_cb,
430
                                         WORD32 beta_cb,
431
                                         WORD32 alpha_cr,
432
                                         WORD32 beta_cr,
433
                                         UWORD32 u4_bs,
434
                                         const UWORD8 *pu1_cliptab_cb,
435
                                         const UWORD8 *pu1_cliptab_cr)
436
11.3M
{
437
11.3M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438
11.3M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439
11.3M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440
11.3M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441
11.3M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442
11.3M
    __m128i temp1, temp2, temp3, temp4;
443
444
11.3M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445
11.3M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446
11.3M
    __m128i flag_bs, flag1, flag2;
447
11.3M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448
11.3M
    __m128i zero = _mm_setzero_si128();
449
11.3M
    __m128i C0_uv_8x16;
450
11.3M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452
11.3M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
453
11.3M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
454
11.3M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
455
11.3M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457
11.3M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458
11.3M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459
11.3M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460
11.3M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461
11.3M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463
    /* Load and transpose the pixel values */
464
11.3M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465
11.3M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466
11.3M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467
11.3M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468
11.3M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469
11.3M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470
11.3M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471
11.3M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473
11.3M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
474
11.3M
    temp2 = _mm_unpacklo_epi16(linec, lined);
475
11.3M
    temp3 = _mm_unpacklo_epi16(linee, linef);
476
11.3M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478
11.3M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479
11.3M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480
11.3M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481
11.3M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483
11.3M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484
11.3M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485
11.3M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486
11.3M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487
    /* End of transpose */
488
489
11.3M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490
11.3M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491
11.3M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492
11.3M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494
11.3M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495
11.3M
    diff = _mm_abs_epi16(diff);
496
11.3M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497
11.3M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499
11.3M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500
11.3M
    diff = _mm_abs_epi16(diff);
501
11.3M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502
11.3M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504
11.3M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505
11.3M
    diff = _mm_abs_epi16(diff);
506
11.3M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508
11.3M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509
11.3M
    diff = _mm_slli_epi16(diff, 2);
510
11.3M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511
11.3M
    diff = _mm_add_epi16(diff, diff1);
512
11.3M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513
11.3M
    in_macro = _mm_srai_epi16(diff, 3);
514
515
11.3M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516
11.3M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517
11.3M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518
11.3M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520
11.3M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522
11.3M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523
11.3M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524
11.3M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526
11.3M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527
11.3M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529
11.3M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530
11.3M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531
11.3M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532
11.3M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534
11.3M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535
11.3M
    diff = _mm_abs_epi16(diff);
536
11.3M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537
11.3M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539
11.3M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540
11.3M
    diff = _mm_abs_epi16(diff);
541
11.3M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542
11.3M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544
11.3M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545
11.3M
    diff = _mm_abs_epi16(diff);
546
11.3M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548
11.3M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549
11.3M
    diff = _mm_slli_epi16(diff, 2);
550
11.3M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551
11.3M
    diff = _mm_add_epi16(diff, diff1);
552
11.3M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553
11.3M
    in_macro = _mm_srai_epi16(diff, 3);
554
555
11.3M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556
11.3M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557
11.3M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558
11.3M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560
11.3M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562
11.3M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563
11.3M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564
11.3M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566
11.3M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567
11.3M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569
11.3M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570
11.3M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572
11.3M
    flag1 = _mm_packs_epi16(flag1, flag2);
573
11.3M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575
11.3M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576
11.3M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577
11.3M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578
11.3M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580
11.3M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581
11.3M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582
11.3M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583
11.3M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585
    /* Inverse-transpose and store back */
586
11.3M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587
11.3M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588
11.3M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589
11.3M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591
11.3M
    linea = _mm_unpacklo_epi32(temp1, temp3);
592
11.3M
    lineb = _mm_srli_si128(linea, 8);
593
11.3M
    linec = _mm_unpackhi_epi32(temp1, temp3);
594
11.3M
    lined = _mm_srli_si128(linec, 8);
595
11.3M
    linee = _mm_unpacklo_epi32(temp2, temp4);
596
11.3M
    linef = _mm_srli_si128(linee, 8);
597
11.3M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
598
11.3M
    lineh = _mm_srli_si128(lineg, 8);
599
600
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607
11.3M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609
11.3M
}
610
611
/*****************************************************************************/
612
/*                                                                           */
613
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614
/*                                                                           */
615
/*  Description   : This function performs filtering of a chroma block       */
616
/*                  horizontal edge when the boundary strength is less than  */
617
/*                  4 in high profile.                                       */
618
/*                                                                           */
619
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620
/*                  src_strd         - source stride                         */
621
/*                  alpha_cb         - alpha value for the boundary in U     */
622
/*                  beta_cb          - beta value for the boundary in U      */
623
/*                  alpha_cr         - alpha value for the boundary in V     */
624
/*                  beta_cr          - beta value for the boundary in V      */
625
/*                  u4_bs            - packed Boundary strength array        */
626
/*                  pu1_cliptab_cb   - tc0_table for U                       */
627
/*                  pu1_cliptab_cr   - tc0_table for V                       */
628
/*                                                                           */
629
/*  Globals       : None                                                     */
630
/*                                                                           */
631
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632
/*                  title "Filtering process for edges for bS less than 4"   */
633
/*                  in ITU T Rec H.264 with alpha and beta values different  */
634
/*                  in U and V.                                              */
635
/*                                                                           */
636
/*  Outputs       : None                                                     */
637
/*                                                                           */
638
/*  Returns       : None                                                     */
639
/*                                                                           */
640
/*  Issues        : None                                                     */
641
/*                                                                           */
642
/*  Revision History:                                                        */
643
/*                                                                           */
644
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645
/*         12 02 2015   Naveen Kumar P  Initial version                      */
646
/*                                                                           */
647
/*****************************************************************************/
648
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649
                                         WORD32 src_strd,
650
                                         WORD32 alpha_cb,
651
                                         WORD32 beta_cb,
652
                                         WORD32 alpha_cr,
653
                                         WORD32 beta_cr,
654
                                         UWORD32 u4_bs,
655
                                         const UWORD8 *pu1_cliptab_cb,
656
                                         const UWORD8 *pu1_cliptab_cr)
657
11.4M
{
658
11.4M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659
11.4M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
660
11.4M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662
11.4M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663
11.4M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664
11.4M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665
11.4M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666
11.4M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667
11.4M
    __m128i flag_bs, flag1, flag2;
668
11.4M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669
11.4M
    __m128i zero = _mm_setzero_si128();
670
11.4M
    __m128i C0_uv_8x16;
671
11.4M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673
11.4M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675
11.4M
    i16_posQ1 = src_strd;
676
11.4M
    i16_posP0 = src_strd;
677
11.4M
    i16_posP1 = 0;
678
679
11.4M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
680
11.4M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
681
11.4M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
682
11.4M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684
11.4M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685
11.4M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686
11.4M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687
11.4M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688
11.4M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690
11.4M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691
11.4M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692
11.4M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693
11.4M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695
11.4M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696
11.4M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697
11.4M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698
11.4M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700
11.4M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701
11.4M
    diff = _mm_abs_epi16(diff);
702
11.4M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703
11.4M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705
11.4M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706
11.4M
    diff = _mm_abs_epi16(diff);
707
11.4M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708
11.4M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710
11.4M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711
11.4M
    diff = _mm_abs_epi16(diff);
712
11.4M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714
11.4M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715
11.4M
    diff = _mm_slli_epi16(diff, 2);
716
11.4M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717
11.4M
    diff = _mm_add_epi16(diff, diff1);
718
11.4M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719
11.4M
    in_macro = _mm_srai_epi16(diff, 3);
720
721
11.4M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722
11.4M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723
11.4M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724
11.4M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726
11.4M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728
11.4M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729
11.4M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730
11.4M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732
11.4M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733
11.4M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735
11.4M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736
11.4M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737
11.4M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738
11.4M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740
11.4M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741
11.4M
    diff = _mm_abs_epi16(diff);
742
11.4M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743
11.4M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745
11.4M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746
11.4M
    diff = _mm_abs_epi16(diff);
747
11.4M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748
11.4M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750
11.4M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751
11.4M
    diff = _mm_abs_epi16(diff);
752
11.4M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754
11.4M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755
11.4M
    diff = _mm_slli_epi16(diff, 2);
756
11.4M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757
11.4M
    diff = _mm_add_epi16(diff, diff1);
758
11.4M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759
11.4M
    in_macro = _mm_srai_epi16(diff, 3);
760
761
11.4M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762
11.4M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763
11.4M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764
11.4M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766
11.4M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768
11.4M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769
11.4M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770
11.4M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772
11.4M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773
11.4M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775
11.4M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776
11.4M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778
11.4M
    flag1 = _mm_packs_epi16(flag1, flag2);
779
11.4M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781
11.4M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782
11.4M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783
11.4M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784
11.4M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785
11.4M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787
11.4M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788
11.4M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789
11.4M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790
11.4M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791
11.4M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793
11.4M
}
794
795
/*****************************************************************************/
796
/*                                                                           */
797
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798
/*                                                                           */
799
/*  Description   : This function performs filtering of a chroma block       */
800
/*                  vertical edge when boundary strength is set to 4 in high */
801
/*                  profile.                                                 */
802
/*                                                                           */
803
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804
/*                  src_strd         - source stride                         */
805
/*                  alpha_cb         - alpha value for the boundary in U     */
806
/*                  beta_cb          - beta value for the boundary in U      */
807
/*                  alpha_cr         - alpha value for the boundary in V     */
808
/*                  beta_cr          - beta value for the boundary in V      */
809
/*                  Note: this function takes no u4_bs, pu1_cliptab_cb or    */
810
/*                  pu1_cliptab_cr argument; the bS = 4 filter is applied    */
811
/*                  wherever the alpha/beta tests pass.                      */
812
/*                                                                           */
813
/*  Globals       : None                                                     */
814
/*                                                                           */
815
/*  Processing    : When the function is called twice, this operation is as  */
816
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818
/*                  with alpha and beta values different in U and V.         */
819
/*                                                                           */
820
/*  Outputs       : None                                                     */
821
/*                                                                           */
822
/*  Returns       : None                                                     */
823
/*                                                                           */
824
/*  Issues        : None                                                     */
825
/*                                                                           */
826
/*  Revision History:                                                        */
827
/*                                                                           */
828
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829
/*         12 02 2015   Naveen Kumar P  Initial version                      */
830
/*                                                                           */
831
/*****************************************************************************/
832
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833
                                             WORD32 src_strd,
834
                                             WORD32 alpha_cb,
835
                                             WORD32 beta_cb,
836
                                             WORD32 alpha_cr,
837
                                             WORD32 beta_cr)
838
31.1k
{
839
31.1k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840
31.1k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841
31.1k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842
31.1k
    __m128i linea, lineb, linec, lined;
843
31.1k
    __m128i temp1, temp2;
844
845
31.1k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846
31.1k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847
31.1k
    __m128i flag1;
848
31.1k
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849
31.1k
    __m128i zero = _mm_setzero_si128();
850
31.1k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852
    /* Load and transpose the pixel values */
853
31.1k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854
31.1k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855
31.1k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856
31.1k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858
31.1k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
859
31.1k
    temp2 = _mm_unpacklo_epi16(linec, lined);
860
861
31.1k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862
31.1k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863
31.1k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864
31.1k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865
    /* End of transpose */
866
867
31.1k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868
31.1k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869
31.1k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870
31.1k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872
31.1k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873
31.1k
    diff = _mm_abs_epi16(diff);
874
31.1k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875
31.1k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877
31.1k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878
31.1k
    diff = _mm_abs_epi16(diff);
879
31.1k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880
31.1k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882
31.1k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883
31.1k
    diff = _mm_abs_epi16(diff);
884
31.1k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886
31.1k
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887
31.1k
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888
31.1k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889
31.1k
    temp1 = _mm_add_epi16(temp1, temp2);
890
31.1k
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892
31.1k
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893
31.1k
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894
31.1k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895
31.1k
    temp1 = _mm_add_epi16(temp1, temp2);
896
31.1k
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898
31.1k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899
31.1k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901
31.1k
    flag1 = _mm_packs_epi16(flag1, flag1);
902
903
31.1k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904
31.1k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905
31.1k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906
31.1k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908
31.1k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909
31.1k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910
31.1k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911
31.1k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913
    /* Inverse-transpose and store back */
914
31.1k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915
31.1k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917
31.1k
    linea = _mm_unpacklo_epi32(temp1, temp2);
918
31.1k
    lineb = _mm_srli_si128(linea, 8);
919
31.1k
    linec = _mm_unpackhi_epi32(temp1, temp2);
920
31.1k
    lined = _mm_srli_si128(linec, 8);
921
922
31.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923
31.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924
31.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925
31.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927
31.1k
}
928
929
/*****************************************************************************/
930
/*                                                                           */
931
/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932
/*                                                                           */
933
/*  Description   : This function performs filtering of a chroma block       */
934
/*                  vertical edge when boundary strength is less than 4 in   */
935
/*                  high profile.                                            */
936
/*                                                                           */
937
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938
/*                  src_strd         - source stride                         */
939
/*                  alpha_cb         - alpha value for the boundary in U     */
940
/*                  beta_cb          - beta value for the boundary in U      */
941
/*                  alpha_cr         - alpha value for the boundary in V     */
942
/*                  beta_cr          - beta value for the boundary in V      */
943
/*                  u4_bs            - packed Boundary strength array        */
944
/*                  pu1_cliptab_cb   - tc0_table for U                       */
945
/*                  pu1_cliptab_cr   - tc0_table for V                       */
946
/*                                                                           */
947
/*  Globals       : None                                                     */
948
/*                                                                           */
949
/*  Processing    : When the function is called twice, this operation is as  */
950
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
951
/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952
/*                  with alpha and beta values different in U and V.         */
953
/*                                                                           */
954
/*  Outputs       : None                                                     */
955
/*                                                                           */
956
/*  Returns       : None                                                     */
957
/*                                                                           */
958
/*  Issues        : None                                                     */
959
/*                                                                           */
960
/*  Revision History:                                                        */
961
/*                                                                           */
962
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963
/*         12 02 2015   Naveen Kumar P  Initial version                      */
964
/*                                                                           */
965
/*****************************************************************************/
966
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967
                                               WORD32 src_strd,
968
                                               WORD32 alpha_cb,
969
                                               WORD32 beta_cb,
970
                                               WORD32 alpha_cr,
971
                                               WORD32 beta_cr,
972
                                               UWORD32 u4_bs,
973
                                               const UWORD8 *pu1_cliptab_cb,
974
                                               const UWORD8 *pu1_cliptab_cr)
975
13.1k
{
976
13.1k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977
13.1k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978
13.1k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979
13.1k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980
13.1k
    __m128i linea, lineb, linec, lined;
981
13.1k
    __m128i temp1, temp2;
982
983
13.1k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984
13.1k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985
13.1k
    __m128i flag_bs, flag1;
986
13.1k
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987
13.1k
    __m128i zero = _mm_setzero_si128();
988
13.1k
    __m128i C0_uv_8x16;
989
13.1k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991
13.1k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
992
13.1k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
993
13.1k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
994
13.1k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996
13.1k
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997
13.1k
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998
13.1k
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999
13.1k
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001
    /* Load and transpose the pixel values */
1002
13.1k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003
13.1k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004
13.1k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005
13.1k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007
13.1k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
1008
13.1k
    temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010
13.1k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011
13.1k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012
13.1k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013
13.1k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014
    /* End of transpose */
1015
1016
13.1k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017
13.1k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018
13.1k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019
13.1k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021
13.1k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022
13.1k
    diff = _mm_abs_epi16(diff);
1023
13.1k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024
13.1k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026
13.1k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027
13.1k
    diff = _mm_abs_epi16(diff);
1028
13.1k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029
13.1k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031
13.1k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032
13.1k
    diff = _mm_abs_epi16(diff);
1033
13.1k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035
13.1k
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036
13.1k
    diff = _mm_slli_epi16(diff, 2);
1037
13.1k
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038
13.1k
    diff = _mm_add_epi16(diff, diff1);
1039
13.1k
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040
13.1k
    in_macro = _mm_srai_epi16(diff, 3);
1041
1042
13.1k
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043
13.1k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044
13.1k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045
13.1k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047
13.1k
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049
13.1k
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050
13.1k
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051
13.1k
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053
13.1k
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054
13.1k
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056
13.1k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057
13.1k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059
13.1k
    flag1 = _mm_packs_epi16(flag1, flag1);
1060
13.1k
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062
13.1k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063
13.1k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064
13.1k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065
13.1k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067
13.1k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068
13.1k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069
13.1k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070
13.1k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072
    /* Inverse-transpose and store back */
1073
13.1k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074
13.1k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076
13.1k
    linea = _mm_unpacklo_epi32(temp1, temp2);
1077
13.1k
    lineb = _mm_srli_si128(linea, 8);
1078
13.1k
    linec = _mm_unpackhi_epi32(temp1, temp2);
1079
13.1k
    lined = _mm_srli_si128(linec, 8);
1080
1081
13.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082
13.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083
13.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084
13.1k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086
13.1k
}
1087