Coverage Report

Created: 2025-10-28 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_chroma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27
/*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28
/*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29
/*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30
/*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31
/*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a chroma block       */
65
/*                  vertical edge when the boundary strength is set to 4 in  */
66
/*                  high profile.                                            */
67
/*                                                                           */
68
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69
/*                  src_strd   - source stride                               */
70
/*                  alpha_cb   - alpha value for the boundary in U           */
71
/*                  beta_cb    - beta value for the boundary in U            */
72
/*                  alpha_cr   - alpha value for the boundary in V           */
73
/*                  beta_cr    - beta value for the boundary in V            */
74
/*                                                                           */
75
/*  Globals       : None                                                     */
76
/*                                                                           */
77
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78
/*                  title "Filtering process for edges for bS equal to 4" in */
79
/*                  ITU T Rec H.264 with alpha and beta values different in  */
80
/*                  U and V.                                                 */
81
/*                                                                           */
82
/*  Outputs       : None                                                     */
83
/*                                                                           */
84
/*  Returns       : None                                                     */
85
/*                                                                           */
86
/*  Issues        : None                                                     */
87
/*                                                                           */
88
/*  Revision History:                                                        */
89
/*                                                                           */
90
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91
/*         12 02 2015   Naveen Kumar P  Initial version                      */
92
/*                                                                           */
93
/*****************************************************************************/
94
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95
                                       WORD32 src_strd,
96
                                       WORD32 alpha_cb,
97
                                       WORD32 beta_cb,
98
                                       WORD32 alpha_cr,
99
                                       WORD32 beta_cr)
100
8.45M
{
101
8.45M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102
8.45M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103
8.45M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104
8.45M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105
8.45M
    __m128i temp1, temp2, temp3, temp4;
106
107
8.45M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108
8.45M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109
8.45M
    __m128i flag1, flag2;
110
8.45M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111
8.45M
    __m128i zero = _mm_setzero_si128();
112
8.45M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114
    /* Load and transpose the pixel values */
115
8.45M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116
8.45M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117
8.45M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118
8.45M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119
8.45M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120
8.45M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121
8.45M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122
8.45M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124
8.45M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
125
8.45M
    temp2 = _mm_unpacklo_epi16(linec, lined);
126
8.45M
    temp3 = _mm_unpacklo_epi16(linee, linef);
127
8.45M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129
8.45M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130
8.45M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131
8.45M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132
8.45M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134
8.45M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135
8.45M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136
8.45M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137
8.45M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138
    /* End of transpose */
139
140
8.45M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141
8.45M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142
8.45M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143
8.45M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145
8.45M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146
8.45M
    diff = _mm_abs_epi16(diff);
147
8.45M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148
8.45M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150
8.45M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151
8.45M
    diff = _mm_abs_epi16(diff);
152
8.45M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153
8.45M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155
8.45M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156
8.45M
    diff = _mm_abs_epi16(diff);
157
8.45M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159
8.45M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160
8.45M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
163
8.45M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165
8.45M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166
8.45M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
169
8.45M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171
8.45M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172
8.45M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173
8.45M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174
8.45M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176
8.45M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177
8.45M
    diff = _mm_abs_epi16(diff);
178
8.45M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179
8.45M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181
8.45M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182
8.45M
    diff = _mm_abs_epi16(diff);
183
8.45M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184
8.45M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186
8.45M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187
8.45M
    diff = _mm_abs_epi16(diff);
188
8.45M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190
8.45M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191
8.45M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
194
8.45M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196
8.45M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197
8.45M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
200
8.45M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202
8.45M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203
8.45M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205
8.45M
    flag1 = _mm_packs_epi16(flag1, flag2);
206
207
8.45M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208
8.45M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209
8.45M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210
8.45M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212
8.45M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213
8.45M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214
8.45M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215
8.45M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217
    /* Inverse-transpose and store back */
218
8.45M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219
8.45M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220
8.45M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221
8.45M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223
8.45M
    linea = _mm_unpacklo_epi32(temp1, temp3);
224
8.45M
    lineb = _mm_srli_si128(linea, 8);
225
8.45M
    linec = _mm_unpackhi_epi32(temp1, temp3);
226
8.45M
    lined = _mm_srli_si128(linec, 8);
227
8.45M
    linee = _mm_unpacklo_epi32(temp2, temp4);
228
8.45M
    linef = _mm_srli_si128(linee, 8);
229
8.45M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
230
8.45M
    lineh = _mm_srli_si128(lineg, 8);
231
232
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239
8.45M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241
8.45M
}
242
243
/*****************************************************************************/
244
/*                                                                           */
245
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246
/*                                                                           */
247
/*  Description   : This function performs filtering of a chroma block       */
248
/*                  horizontal edge when the boundary strength is set to 4   */
249
/*                  in high profile.                                         */
250
/*                                                                           */
251
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252
/*                  src_strd   - source stride                               */
253
/*                  alpha_cb   - alpha value for the boundary in U           */
254
/*                  beta_cb    - beta value for the boundary in U            */
255
/*                  alpha_cr   - alpha value for the boundary in V           */
256
/*                  beta_cr    - beta value for the boundary in V            */
257
/*                                                                           */
258
/*  Globals       : None                                                     */
259
/*                                                                           */
260
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261
/*                  title "Filtering process for edges for bS equal to 4" in */
262
/*                  ITU T Rec H.264 with alpha and beta values different in  */
263
/*                  U and V.                                                 */
264
/*                                                                           */
265
/*  Outputs       : None                                                     */
266
/*                                                                           */
267
/*  Returns       : None                                                     */
268
/*                                                                           */
269
/*  Issues        : None                                                     */
270
/*                                                                           */
271
/*  Revision History:                                                        */
272
/*                                                                           */
273
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274
/*         12 02 2015   Naveen Kumar P  Initial version                      */
275
/*                                                                           */
276
/*****************************************************************************/
277
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278
                                       WORD32 src_strd,
279
                                       WORD32 alpha_cb,
280
                                       WORD32 beta_cb,
281
                                       WORD32 alpha_cr,
282
                                       WORD32 beta_cr)
283
8.37M
{
284
8.37M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285
8.37M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287
8.37M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288
8.37M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289
8.37M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290
8.37M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291
8.37M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292
8.37M
    __m128i flag1, flag2;
293
8.37M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294
8.37M
    __m128i zero = _mm_setzero_si128();
295
8.37M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296
8.37M
    __m128i temp1, temp2;
297
298
8.37M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300
8.37M
    i16_posQ1 = src_strd;
301
8.37M
    i16_posP0 = src_strd;
302
8.37M
    i16_posP1 = 0;
303
304
8.37M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305
8.37M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306
8.37M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307
8.37M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309
8.37M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310
8.37M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311
8.37M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312
8.37M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314
8.37M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315
8.37M
    diff = _mm_abs_epi16(diff);
316
8.37M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317
8.37M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319
8.37M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320
8.37M
    diff = _mm_abs_epi16(diff);
321
8.37M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322
8.37M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324
8.37M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325
8.37M
    diff = _mm_abs_epi16(diff);
326
8.37M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328
8.37M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329
8.37M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330
8.37M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331
8.37M
    temp1 = _mm_add_epi16(temp1, temp2);
332
8.37M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334
8.37M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335
8.37M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336
8.37M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337
8.37M
    temp1 = _mm_add_epi16(temp1, temp2);
338
8.37M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340
8.37M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341
8.37M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342
8.37M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343
8.37M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345
8.37M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346
8.37M
    diff = _mm_abs_epi16(diff);
347
8.37M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348
8.37M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350
8.37M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351
8.37M
    diff = _mm_abs_epi16(diff);
352
8.37M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353
8.37M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355
8.37M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356
8.37M
    diff = _mm_abs_epi16(diff);
357
8.37M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359
8.37M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360
8.37M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361
8.37M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362
8.37M
    temp1 = _mm_add_epi16(temp1, temp2);
363
8.37M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365
8.37M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366
8.37M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367
8.37M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368
8.37M
    temp1 = _mm_add_epi16(temp1, temp2);
369
8.37M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371
8.37M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372
8.37M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374
8.37M
    flag1 = _mm_packs_epi16(flag1, flag2);
375
376
8.37M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377
8.37M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378
8.37M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379
8.37M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380
8.37M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382
8.37M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383
8.37M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384
8.37M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385
8.37M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386
8.37M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388
8.37M
}
389
390
/*****************************************************************************/
391
/*                                                                           */
392
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393
/*                                                                           */
394
/*  Description   : This function performs filtering of a chroma block       */
395
/*                  vertical edge when the boundary strength is less than 4  */
396
/*                  in high profile.                                         */
397
/*                                                                           */
398
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399
/*                  src_strd         - source stride                         */
400
/*                  alpha_cb         - alpha value for the boundary in U     */
401
/*                  beta_cb          - beta value for the boundary in U      */
402
/*                  alpha_cr         - alpha value for the boundary in V     */
403
/*                  beta_cr          - beta value for the boundary in V      */
404
/*                  u4_bs            - packed Boundary strength array        */
405
/*                  pu1_cliptab_cb   - tc0_table for U                       */
406
/*                  pu1_cliptab_cr   - tc0_table for V                       */
407
/*                                                                           */
408
/*  Globals       : None                                                     */
409
/*                                                                           */
410
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411
/*                  title "Filtering process for edges for bS less than 4"   */
412
/*                  in ITU T Rec H.264 with alpha and beta values different  */
413
/*                  in U and V.                                              */
414
/*                                                                           */
415
/*  Outputs       : None                                                     */
416
/*                                                                           */
417
/*  Returns       : None                                                     */
418
/*                                                                           */
419
/*  Issues        : None                                                     */
420
/*                                                                           */
421
/*  Revision History:                                                        */
422
/*                                                                           */
423
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424
/*         12 02 2015   Naveen Kumar P  Initial version                      */
425
/*                                                                           */
426
/*****************************************************************************/
427
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428
                                         WORD32 src_strd,
429
                                         WORD32 alpha_cb,
430
                                         WORD32 beta_cb,
431
                                         WORD32 alpha_cr,
432
                                         WORD32 beta_cr,
433
                                         UWORD32 u4_bs,
434
                                         const UWORD8 *pu1_cliptab_cb,
435
                                         const UWORD8 *pu1_cliptab_cr)
436
10.6M
{
437
10.6M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438
10.6M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439
10.6M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440
10.6M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441
10.6M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442
10.6M
    __m128i temp1, temp2, temp3, temp4;
443
444
10.6M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445
10.6M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446
10.6M
    __m128i flag_bs, flag1, flag2;
447
10.6M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448
10.6M
    __m128i zero = _mm_setzero_si128();
449
10.6M
    __m128i C0_uv_8x16;
450
10.6M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452
10.6M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
453
10.6M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
454
10.6M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
455
10.6M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457
10.6M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458
10.6M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459
10.6M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460
10.6M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461
10.6M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463
    /* Load and transpose the pixel values */
464
10.6M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465
10.6M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466
10.6M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467
10.6M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468
10.6M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469
10.6M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470
10.6M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471
10.6M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473
10.6M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
474
10.6M
    temp2 = _mm_unpacklo_epi16(linec, lined);
475
10.6M
    temp3 = _mm_unpacklo_epi16(linee, linef);
476
10.6M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478
10.6M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479
10.6M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480
10.6M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481
10.6M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483
10.6M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484
10.6M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485
10.6M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486
10.6M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487
    /* End of transpose */
488
489
10.6M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490
10.6M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491
10.6M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492
10.6M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494
10.6M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495
10.6M
    diff = _mm_abs_epi16(diff);
496
10.6M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497
10.6M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499
10.6M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500
10.6M
    diff = _mm_abs_epi16(diff);
501
10.6M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502
10.6M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504
10.6M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505
10.6M
    diff = _mm_abs_epi16(diff);
506
10.6M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508
10.6M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509
10.6M
    diff = _mm_slli_epi16(diff, 2);
510
10.6M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511
10.6M
    diff = _mm_add_epi16(diff, diff1);
512
10.6M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513
10.6M
    in_macro = _mm_srai_epi16(diff, 3);
514
515
10.6M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516
10.6M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517
10.6M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518
10.6M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520
10.6M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522
10.6M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523
10.6M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524
10.6M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526
10.6M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527
10.6M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529
10.6M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530
10.6M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531
10.6M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532
10.6M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534
10.6M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535
10.6M
    diff = _mm_abs_epi16(diff);
536
10.6M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537
10.6M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539
10.6M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540
10.6M
    diff = _mm_abs_epi16(diff);
541
10.6M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542
10.6M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544
10.6M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545
10.6M
    diff = _mm_abs_epi16(diff);
546
10.6M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548
10.6M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549
10.6M
    diff = _mm_slli_epi16(diff, 2);
550
10.6M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551
10.6M
    diff = _mm_add_epi16(diff, diff1);
552
10.6M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553
10.6M
    in_macro = _mm_srai_epi16(diff, 3);
554
555
10.6M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556
10.6M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557
10.6M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558
10.6M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560
10.6M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562
10.6M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563
10.6M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564
10.6M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566
10.6M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567
10.6M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569
10.6M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570
10.6M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572
10.6M
    flag1 = _mm_packs_epi16(flag1, flag2);
573
10.6M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575
10.6M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576
10.6M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577
10.6M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578
10.6M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580
10.6M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581
10.6M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582
10.6M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583
10.6M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585
    /* Inverse-transpose and store back */
586
10.6M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587
10.6M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588
10.6M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589
10.6M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591
10.6M
    linea = _mm_unpacklo_epi32(temp1, temp3);
592
10.6M
    lineb = _mm_srli_si128(linea, 8);
593
10.6M
    linec = _mm_unpackhi_epi32(temp1, temp3);
594
10.6M
    lined = _mm_srli_si128(linec, 8);
595
10.6M
    linee = _mm_unpacklo_epi32(temp2, temp4);
596
10.6M
    linef = _mm_srli_si128(linee, 8);
597
10.6M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
598
10.6M
    lineh = _mm_srli_si128(lineg, 8);
599
600
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607
10.6M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609
10.6M
}
610
611
/*****************************************************************************/
612
/*                                                                           */
613
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614
/*                                                                           */
615
/*  Description   : This function performs filtering of a chroma block       */
616
/*                  horizontal edge when the boundary strength is less than  */
617
/*                  4 in high profile.                                       */
618
/*                                                                           */
619
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620
/*                  src_strd         - source stride                         */
621
/*                  alpha_cb         - alpha value for the boundary in U     */
622
/*                  beta_cb          - beta value for the boundary in U      */
623
/*                  alpha_cr         - alpha value for the boundary in V     */
624
/*                  beta_cr          - beta value for the boundary in V      */
625
/*                  u4_bs            - packed Boundary strength array        */
626
/*                  pu1_cliptab_cb   - tc0_table for U                       */
627
/*                  pu1_cliptab_cr   - tc0_table for V                       */
628
/*                                                                           */
629
/*  Globals       : None                                                     */
630
/*                                                                           */
631
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632
/*                  title "Filtering process for edges for bS less than 4"   */
633
/*                  in ITU T Rec H.264 with alpha and beta values different  */
634
/*                  in U and V.                                              */
635
/*                                                                           */
636
/*  Outputs       : None                                                     */
637
/*                                                                           */
638
/*  Returns       : None                                                     */
639
/*                                                                           */
640
/*  Issues        : None                                                     */
641
/*                                                                           */
642
/*  Revision History:                                                        */
643
/*                                                                           */
644
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645
/*         12 02 2015   Naveen Kumar P  Initial version                      */
646
/*                                                                           */
647
/*****************************************************************************/
648
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649
                                         WORD32 src_strd,
650
                                         WORD32 alpha_cb,
651
                                         WORD32 beta_cb,
652
                                         WORD32 alpha_cr,
653
                                         WORD32 beta_cr,
654
                                         UWORD32 u4_bs,
655
                                         const UWORD8 *pu1_cliptab_cb,
656
                                         const UWORD8 *pu1_cliptab_cr)
657
10.7M
{
658
10.7M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659
10.7M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
660
10.7M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662
10.7M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663
10.7M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664
10.7M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665
10.7M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666
10.7M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667
10.7M
    __m128i flag_bs, flag1, flag2;
668
10.7M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669
10.7M
    __m128i zero = _mm_setzero_si128();
670
10.7M
    __m128i C0_uv_8x16;
671
10.7M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673
10.7M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675
10.7M
    i16_posQ1 = src_strd;
676
10.7M
    i16_posP0 = src_strd;
677
10.7M
    i16_posP1 = 0;
678
679
10.7M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
680
10.7M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
681
10.7M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
682
10.7M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684
10.7M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685
10.7M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686
10.7M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687
10.7M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688
10.7M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690
10.7M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691
10.7M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692
10.7M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693
10.7M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695
10.7M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696
10.7M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697
10.7M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698
10.7M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700
10.7M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701
10.7M
    diff = _mm_abs_epi16(diff);
702
10.7M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703
10.7M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705
10.7M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706
10.7M
    diff = _mm_abs_epi16(diff);
707
10.7M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708
10.7M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710
10.7M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711
10.7M
    diff = _mm_abs_epi16(diff);
712
10.7M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714
10.7M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715
10.7M
    diff = _mm_slli_epi16(diff, 2);
716
10.7M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717
10.7M
    diff = _mm_add_epi16(diff, diff1);
718
10.7M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719
10.7M
    in_macro = _mm_srai_epi16(diff, 3);
720
721
10.7M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722
10.7M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723
10.7M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724
10.7M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726
10.7M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728
10.7M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729
10.7M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730
10.7M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732
10.7M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733
10.7M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735
10.7M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736
10.7M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737
10.7M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738
10.7M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740
10.7M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741
10.7M
    diff = _mm_abs_epi16(diff);
742
10.7M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743
10.7M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745
10.7M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746
10.7M
    diff = _mm_abs_epi16(diff);
747
10.7M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748
10.7M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750
10.7M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751
10.7M
    diff = _mm_abs_epi16(diff);
752
10.7M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754
10.7M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755
10.7M
    diff = _mm_slli_epi16(diff, 2);
756
10.7M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757
10.7M
    diff = _mm_add_epi16(diff, diff1);
758
10.7M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759
10.7M
    in_macro = _mm_srai_epi16(diff, 3);
760
761
10.7M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762
10.7M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763
10.7M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764
10.7M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766
10.7M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768
10.7M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769
10.7M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770
10.7M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772
10.7M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773
10.7M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775
10.7M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776
10.7M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778
10.7M
    flag1 = _mm_packs_epi16(flag1, flag2);
779
10.7M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781
10.7M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782
10.7M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783
10.7M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784
10.7M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785
10.7M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787
10.7M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788
10.7M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789
10.7M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790
10.7M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791
10.7M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793
10.7M
}
794
795
/*****************************************************************************/
796
/*                                                                           */
797
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798
/*                                                                           */
799
/*  Description   : This function performs filtering of a chroma block       */
800
/*                  vertical edge when boundary strength is set to 4 in high */
801
/*                  profile.                                                 */
802
/*                                                                           */
803
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804
/*                  src_strd         - source stride                         */
805
/*                  alpha_cb         - alpha value for the boundary in U     */
806
/*                  beta_cb          - beta value for the boundary in U      */
807
/*                  alpha_cr         - alpha value for the boundary in V     */
808
/*                  beta_cr          - beta value for the boundary in V      */
809
/*                  u4_bs            - packed Boundary strength array        */
810
/*                  pu1_cliptab_cb   - tc0_table for U                       */
811
/*                  pu1_cliptab_cr   - tc0_table for V                       */
812
/*                                                                           */
813
/*  Globals       : None                                                     */
814
/*                                                                           */
815
/*  Processing    : When the function is called twice, this operation is as  */
816
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818
/*                  with alpha and beta values different in U and V.         */
819
/*                                                                           */
820
/*  Outputs       : None                                                     */
821
/*                                                                           */
822
/*  Returns       : None                                                     */
823
/*                                                                           */
824
/*  Issues        : None                                                     */
825
/*                                                                           */
826
/*  Revision History:                                                        */
827
/*                                                                           */
828
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829
/*         12 02 2015   Naveen Kumar P  Initial version                      */
830
/*                                                                           */
831
/*****************************************************************************/
832
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833
                                             WORD32 src_strd,
834
                                             WORD32 alpha_cb,
835
                                             WORD32 beta_cb,
836
                                             WORD32 alpha_cr,
837
                                             WORD32 beta_cr)
838
20.6k
{
839
20.6k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840
20.6k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841
20.6k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842
20.6k
    __m128i linea, lineb, linec, lined;
843
20.6k
    __m128i temp1, temp2;
844
845
20.6k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846
20.6k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847
20.6k
    __m128i flag1;
848
20.6k
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849
20.6k
    __m128i zero = _mm_setzero_si128();
850
20.6k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852
    /* Load and transpose the pixel values */
853
20.6k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854
20.6k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855
20.6k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856
20.6k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858
20.6k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
859
20.6k
    temp2 = _mm_unpacklo_epi16(linec, lined);
860
861
20.6k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862
20.6k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863
20.6k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864
20.6k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865
    /* End of transpose */
866
867
20.6k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868
20.6k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869
20.6k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870
20.6k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872
20.6k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873
20.6k
    diff = _mm_abs_epi16(diff);
874
20.6k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875
20.6k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877
20.6k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878
20.6k
    diff = _mm_abs_epi16(diff);
879
20.6k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880
20.6k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882
20.6k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883
20.6k
    diff = _mm_abs_epi16(diff);
884
20.6k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886
20.6k
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887
20.6k
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888
20.6k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889
20.6k
    temp1 = _mm_add_epi16(temp1, temp2);
890
20.6k
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892
20.6k
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893
20.6k
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894
20.6k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895
20.6k
    temp1 = _mm_add_epi16(temp1, temp2);
896
20.6k
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898
20.6k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899
20.6k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901
20.6k
    flag1 = _mm_packs_epi16(flag1, flag1);
902
903
20.6k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904
20.6k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905
20.6k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906
20.6k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908
20.6k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909
20.6k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910
20.6k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911
20.6k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913
    /* Inverse-transpose and store back */
914
20.6k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915
20.6k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917
20.6k
    linea = _mm_unpacklo_epi32(temp1, temp2);
918
20.6k
    lineb = _mm_srli_si128(linea, 8);
919
20.6k
    linec = _mm_unpackhi_epi32(temp1, temp2);
920
20.6k
    lined = _mm_srli_si128(linec, 8);
921
922
20.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923
20.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924
20.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925
20.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927
20.6k
}
928
929
/*****************************************************************************/
930
/*                                                                           */
931
/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932
/*                                                                           */
933
/*  Description   : This function performs filtering of a chroma block       */
934
/*                  vertical edge when boundary strength is less than 4 in   */
935
/*                  high profile.                                            */
936
/*                                                                           */
937
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938
/*                  src_strd         - source stride                         */
939
/*                  alpha_cb         - alpha value for the boundary in U     */
940
/*                  beta_cb          - beta value for the boundary in U      */
941
/*                  alpha_cr         - alpha value for the boundary in V     */
942
/*                  beta_cr          - beta value for the boundary in V      */
943
/*                  u4_bs            - packed Boundary strength array        */
944
/*                  pu1_cliptab_cb   - tc0_table for U                       */
945
/*                  pu1_cliptab_cr   - tc0_table for V                       */
946
/*                                                                           */
947
/*  Globals       : None                                                     */
948
/*                                                                           */
949
/*  Processing    : When the function is called twice, this operation is as  */
950
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
951
/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952
/*                  with alpha and beta values different in U and V.         */
953
/*                                                                           */
954
/*  Outputs       : None                                                     */
955
/*                                                                           */
956
/*  Returns       : None                                                     */
957
/*                                                                           */
958
/*  Issues        : None                                                     */
959
/*                                                                           */
960
/*  Revision History:                                                        */
961
/*                                                                           */
962
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963
/*         12 02 2015   Naveen Kumar P  Initial version                      */
964
/*                                                                           */
965
/*****************************************************************************/
966
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967
                                               WORD32 src_strd,
968
                                               WORD32 alpha_cb,
969
                                               WORD32 beta_cb,
970
                                               WORD32 alpha_cr,
971
                                               WORD32 beta_cr,
972
                                               UWORD32 u4_bs,
973
                                               const UWORD8 *pu1_cliptab_cb,
974
                                               const UWORD8 *pu1_cliptab_cr)
975
12.0k
{
976
12.0k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977
12.0k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978
12.0k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979
12.0k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980
12.0k
    __m128i linea, lineb, linec, lined;
981
12.0k
    __m128i temp1, temp2;
982
983
12.0k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984
12.0k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985
12.0k
    __m128i flag_bs, flag1;
986
12.0k
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987
12.0k
    __m128i zero = _mm_setzero_si128();
988
12.0k
    __m128i C0_uv_8x16;
989
12.0k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991
12.0k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
992
12.0k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
993
12.0k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
994
12.0k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996
12.0k
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997
12.0k
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998
12.0k
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999
12.0k
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001
    /* Load and transpose the pixel values */
1002
12.0k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003
12.0k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004
12.0k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005
12.0k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007
12.0k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
1008
12.0k
    temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010
12.0k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011
12.0k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012
12.0k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013
12.0k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014
    /* End of transpose */
1015
1016
12.0k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017
12.0k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018
12.0k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019
12.0k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021
12.0k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022
12.0k
    diff = _mm_abs_epi16(diff);
1023
12.0k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024
12.0k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026
12.0k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027
12.0k
    diff = _mm_abs_epi16(diff);
1028
12.0k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029
12.0k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031
12.0k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032
12.0k
    diff = _mm_abs_epi16(diff);
1033
12.0k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035
12.0k
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036
12.0k
    diff = _mm_slli_epi16(diff, 2);
1037
12.0k
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038
12.0k
    diff = _mm_add_epi16(diff, diff1);
1039
12.0k
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040
12.0k
    in_macro = _mm_srai_epi16(diff, 3);
1041
1042
12.0k
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043
12.0k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044
12.0k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045
12.0k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047
12.0k
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049
12.0k
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050
12.0k
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051
12.0k
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053
12.0k
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054
12.0k
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056
12.0k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057
12.0k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059
12.0k
    flag1 = _mm_packs_epi16(flag1, flag1);
1060
12.0k
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062
12.0k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063
12.0k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064
12.0k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065
12.0k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067
12.0k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068
12.0k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069
12.0k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070
12.0k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072
    /* Inverse-transpose and store back */
1073
12.0k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074
12.0k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076
12.0k
    linea = _mm_unpacklo_epi32(temp1, temp2);
1077
12.0k
    lineb = _mm_srli_si128(linea, 8);
1078
12.0k
    linec = _mm_unpackhi_epi32(temp1, temp2);
1079
12.0k
    lined = _mm_srli_si128(linec, 8);
1080
1081
12.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082
12.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083
12.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084
12.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086
12.0k
}
1087