Coverage Report

Created: 2025-11-11 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_chroma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27
/*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28
/*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29
/*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30
/*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31
/*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a chroma block       */
65
/*                  vertical edge when the boundary strength is set to 4 in  */
66
/*                  high profile.                                            */
67
/*                                                                           */
68
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69
/*                  src_strd   - source stride                               */
70
/*                  alpha_cb   - alpha value for the boundary in U           */
71
/*                  beta_cb    - beta value for the boundary in U            */
72
/*                  alpha_cr   - alpha value for the boundary in V           */
73
/*                  beta_cr    - beta value for the boundary in V            */
74
/*                                                                           */
75
/*  Globals       : None                                                     */
76
/*                                                                           */
77
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78
/*                  title "Filtering process for edges for bS equal to 4" in */
79
/*                  ITU T Rec H.264 with alpha and beta values different in  */
80
/*                  U and V.                                                 */
81
/*                                                                           */
82
/*  Outputs       : p0/q0 samples of pu1_src are filtered in place          */
83
/*                                                                           */
84
/*  Returns       : None                                                     */
85
/*                                                                           */
86
/*  Issues        : None                                                     */
87
/*                                                                           */
88
/*  Revision History:                                                        */
89
/*                                                                           */
90
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91
/*         12 02 2015   Naveen Kumar P  Initial version                      */
92
/*                                                                           */
93
/*****************************************************************************/
94
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95
                                       WORD32 src_strd,
96
                                       WORD32 alpha_cb,
97
                                       WORD32 beta_cb,
98
                                       WORD32 alpha_cr,
99
                                       WORD32 beta_cr)
100
8.54M
{
101
8.54M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102
8.54M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103
8.54M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104
8.54M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105
8.54M
    __m128i temp1, temp2, temp3, temp4;
106
107
8.54M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108
8.54M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109
8.54M
    __m128i flag1, flag2;
110
8.54M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111
8.54M
    __m128i zero = _mm_setzero_si128();
112
8.54M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114
    /* Load and transpose the pixel values */
115
8.54M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116
8.54M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117
8.54M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118
8.54M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119
8.54M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120
8.54M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121
8.54M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122
8.54M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124
8.54M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
125
8.54M
    temp2 = _mm_unpacklo_epi16(linec, lined);
126
8.54M
    temp3 = _mm_unpacklo_epi16(linee, linef);
127
8.54M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129
8.54M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130
8.54M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131
8.54M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132
8.54M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134
8.54M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135
8.54M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136
8.54M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137
8.54M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138
    /* End of transpose */
139
140
8.54M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141
8.54M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142
8.54M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143
8.54M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145
8.54M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146
8.54M
    diff = _mm_abs_epi16(diff);
147
8.54M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148
8.54M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150
8.54M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151
8.54M
    diff = _mm_abs_epi16(diff);
152
8.54M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153
8.54M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155
8.54M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156
8.54M
    diff = _mm_abs_epi16(diff);
157
8.54M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159
8.54M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160
8.54M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161
8.54M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162
8.54M
    temp1 = _mm_add_epi16(temp1, temp2);
163
8.54M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165
8.54M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166
8.54M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167
8.54M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168
8.54M
    temp1 = _mm_add_epi16(temp1, temp2);
169
8.54M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171
8.54M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172
8.54M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173
8.54M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174
8.54M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176
8.54M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177
8.54M
    diff = _mm_abs_epi16(diff);
178
8.54M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179
8.54M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181
8.54M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182
8.54M
    diff = _mm_abs_epi16(diff);
183
8.54M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184
8.54M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186
8.54M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187
8.54M
    diff = _mm_abs_epi16(diff);
188
8.54M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190
8.54M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191
8.54M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192
8.54M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193
8.54M
    temp1 = _mm_add_epi16(temp1, temp2);
194
8.54M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196
8.54M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197
8.54M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198
8.54M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199
8.54M
    temp1 = _mm_add_epi16(temp1, temp2);
200
8.54M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202
8.54M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203
8.54M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205
8.54M
    flag1 = _mm_packs_epi16(flag1, flag2);
206
207
8.54M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208
8.54M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209
8.54M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210
8.54M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212
8.54M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213
8.54M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214
8.54M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215
8.54M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217
    /* Inverse-transpose and store back */
218
8.54M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219
8.54M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220
8.54M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221
8.54M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223
8.54M
    linea = _mm_unpacklo_epi32(temp1, temp3);
224
8.54M
    lineb = _mm_srli_si128(linea, 8);
225
8.54M
    linec = _mm_unpackhi_epi32(temp1, temp3);
226
8.54M
    lined = _mm_srli_si128(linec, 8);
227
8.54M
    linee = _mm_unpacklo_epi32(temp2, temp4);
228
8.54M
    linef = _mm_srli_si128(linee, 8);
229
8.54M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
230
8.54M
    lineh = _mm_srli_si128(lineg, 8);
231
232
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239
8.54M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241
8.54M
}
242
243
/*****************************************************************************/
244
/*                                                                           */
245
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246
/*                                                                           */
247
/*  Description   : This function performs filtering of a chroma block       */
248
/*                  horizontal edge when the boundary strength is set to 4   */
249
/*                  in high profile.                                         */
250
/*                                                                           */
251
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252
/*                  src_strd   - source stride                               */
253
/*                  alpha_cb   - alpha value for the boundary in U           */
254
/*                  beta_cb    - beta value for the boundary in U            */
255
/*                  alpha_cr   - alpha value for the boundary in V           */
256
/*                  beta_cr    - beta value for the boundary in V            */
257
/*                                                                           */
258
/*  Globals       : None                                                     */
259
/*                                                                           */
260
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261
/*                  title "Filtering process for edges for bS equal to 4" in */
262
/*                  ITU T Rec H.264 with alpha and beta values different in  */
263
/*                  U and V.                                                 */
264
/*                                                                           */
265
/*  Outputs       : p0/q0 samples of pu1_src are filtered in place          */
266
/*                                                                           */
267
/*  Returns       : None                                                     */
268
/*                                                                           */
269
/*  Issues        : None                                                     */
270
/*                                                                           */
271
/*  Revision History:                                                        */
272
/*                                                                           */
273
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274
/*         12 02 2015   Naveen Kumar P  Initial version                      */
275
/*                                                                           */
276
/*****************************************************************************/
277
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278
                                       WORD32 src_strd,
279
                                       WORD32 alpha_cb,
280
                                       WORD32 beta_cb,
281
                                       WORD32 alpha_cr,
282
                                       WORD32 beta_cr)
283
8.45M
{
284
8.45M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285
8.45M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287
8.45M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288
8.45M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289
8.45M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290
8.45M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291
8.45M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292
8.45M
    __m128i flag1, flag2;
293
8.45M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294
8.45M
    __m128i zero = _mm_setzero_si128();
295
8.45M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296
8.45M
    __m128i temp1, temp2;
297
298
8.45M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300
8.45M
    i16_posQ1 = src_strd;
301
8.45M
    i16_posP0 = src_strd;
302
8.45M
    i16_posP1 = 0;
303
304
8.45M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305
8.45M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306
8.45M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307
8.45M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309
8.45M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310
8.45M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311
8.45M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312
8.45M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314
8.45M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315
8.45M
    diff = _mm_abs_epi16(diff);
316
8.45M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317
8.45M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319
8.45M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320
8.45M
    diff = _mm_abs_epi16(diff);
321
8.45M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322
8.45M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324
8.45M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325
8.45M
    diff = _mm_abs_epi16(diff);
326
8.45M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328
8.45M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329
8.45M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
332
8.45M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334
8.45M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335
8.45M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
338
8.45M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340
8.45M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341
8.45M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342
8.45M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343
8.45M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345
8.45M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346
8.45M
    diff = _mm_abs_epi16(diff);
347
8.45M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348
8.45M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350
8.45M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351
8.45M
    diff = _mm_abs_epi16(diff);
352
8.45M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353
8.45M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355
8.45M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356
8.45M
    diff = _mm_abs_epi16(diff);
357
8.45M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359
8.45M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360
8.45M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
363
8.45M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365
8.45M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366
8.45M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367
8.45M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368
8.45M
    temp1 = _mm_add_epi16(temp1, temp2);
369
8.45M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371
8.45M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372
8.45M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374
8.45M
    flag1 = _mm_packs_epi16(flag1, flag2);
375
376
8.45M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377
8.45M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378
8.45M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379
8.45M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380
8.45M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382
8.45M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383
8.45M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384
8.45M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385
8.45M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386
8.45M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388
8.45M
}
389
390
/*****************************************************************************/
391
/*                                                                           */
392
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393
/*                                                                           */
394
/*  Description   : This function performs filtering of a chroma block       */
395
/*                  vertical edge when the boundary strength is less than 4  */
396
/*                  in high profile.                                         */
397
/*                                                                           */
398
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399
/*                  src_strd         - source stride                         */
400
/*                  alpha_cb         - alpha value for the boundary in U     */
401
/*                  beta_cb          - beta value for the boundary in U      */
402
/*                  alpha_cr         - alpha value for the boundary in V     */
403
/*                  beta_cr          - beta value for the boundary in V      */
404
/*                  u4_bs            - packed Boundary strength array        */
405
/*                  pu1_cliptab_cb   - tc0_table for U                       */
406
/*                  pu1_cliptab_cr   - tc0_table for V                       */
407
/*                                                                           */
408
/*  Globals       : None                                                     */
409
/*                                                                           */
410
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411
/*                  title "Filtering process for edges for bS less than 4"   */
412
/*                  in ITU T Rec H.264 with alpha and beta values different  */
413
/*                  in U and V.                                              */
414
/*                                                                           */
415
/*  Outputs       : None                                                     */
416
/*                                                                           */
417
/*  Returns       : None                                                     */
418
/*                                                                           */
419
/*  Issues        : None                                                     */
420
/*                                                                           */
421
/*  Revision History:                                                        */
422
/*                                                                           */
423
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424
/*         12 02 2015   Naveen Kumar P  Initial version                      */
425
/*                                                                           */
426
/*****************************************************************************/
427
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428
                                         WORD32 src_strd,
429
                                         WORD32 alpha_cb,
430
                                         WORD32 beta_cb,
431
                                         WORD32 alpha_cr,
432
                                         WORD32 beta_cr,
433
                                         UWORD32 u4_bs,
434
                                         const UWORD8 *pu1_cliptab_cb,
435
                                         const UWORD8 *pu1_cliptab_cr)
436
10.8M
{
437
10.8M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438
10.8M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439
10.8M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440
10.8M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441
10.8M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442
10.8M
    __m128i temp1, temp2, temp3, temp4;
443
444
10.8M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445
10.8M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446
10.8M
    __m128i flag_bs, flag1, flag2;
447
10.8M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448
10.8M
    __m128i zero = _mm_setzero_si128();
449
10.8M
    __m128i C0_uv_8x16;
450
10.8M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452
10.8M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
453
10.8M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
454
10.8M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
455
10.8M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457
10.8M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458
10.8M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459
10.8M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460
10.8M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461
10.8M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463
    /* Load and transpose the pixel values */
464
10.8M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465
10.8M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466
10.8M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467
10.8M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468
10.8M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469
10.8M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470
10.8M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471
10.8M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473
10.8M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
474
10.8M
    temp2 = _mm_unpacklo_epi16(linec, lined);
475
10.8M
    temp3 = _mm_unpacklo_epi16(linee, linef);
476
10.8M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478
10.8M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479
10.8M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480
10.8M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481
10.8M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483
10.8M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484
10.8M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485
10.8M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486
10.8M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487
    /* End of transpose */
488
489
10.8M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490
10.8M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491
10.8M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492
10.8M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494
10.8M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495
10.8M
    diff = _mm_abs_epi16(diff);
496
10.8M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497
10.8M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499
10.8M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500
10.8M
    diff = _mm_abs_epi16(diff);
501
10.8M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502
10.8M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504
10.8M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505
10.8M
    diff = _mm_abs_epi16(diff);
506
10.8M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508
10.8M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509
10.8M
    diff = _mm_slli_epi16(diff, 2);
510
10.8M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511
10.8M
    diff = _mm_add_epi16(diff, diff1);
512
10.8M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513
10.8M
    in_macro = _mm_srai_epi16(diff, 3);
514
515
10.8M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516
10.8M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517
10.8M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518
10.8M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520
10.8M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522
10.8M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523
10.8M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524
10.8M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526
10.8M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527
10.8M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529
10.8M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530
10.8M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531
10.8M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532
10.8M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534
10.8M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535
10.8M
    diff = _mm_abs_epi16(diff);
536
10.8M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537
10.8M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539
10.8M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540
10.8M
    diff = _mm_abs_epi16(diff);
541
10.8M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542
10.8M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544
10.8M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545
10.8M
    diff = _mm_abs_epi16(diff);
546
10.8M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548
10.8M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549
10.8M
    diff = _mm_slli_epi16(diff, 2);
550
10.8M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551
10.8M
    diff = _mm_add_epi16(diff, diff1);
552
10.8M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553
10.8M
    in_macro = _mm_srai_epi16(diff, 3);
554
555
10.8M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556
10.8M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557
10.8M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558
10.8M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560
10.8M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562
10.8M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563
10.8M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564
10.8M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566
10.8M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567
10.8M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569
10.8M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570
10.8M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572
10.8M
    flag1 = _mm_packs_epi16(flag1, flag2);
573
10.8M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575
10.8M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576
10.8M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577
10.8M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578
10.8M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580
10.8M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581
10.8M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582
10.8M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583
10.8M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585
    /* Inverse-transpose and store back */
586
10.8M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587
10.8M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588
10.8M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589
10.8M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591
10.8M
    linea = _mm_unpacklo_epi32(temp1, temp3);
592
10.8M
    lineb = _mm_srli_si128(linea, 8);
593
10.8M
    linec = _mm_unpackhi_epi32(temp1, temp3);
594
10.8M
    lined = _mm_srli_si128(linec, 8);
595
10.8M
    linee = _mm_unpacklo_epi32(temp2, temp4);
596
10.8M
    linef = _mm_srli_si128(linee, 8);
597
10.8M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
598
10.8M
    lineh = _mm_srli_si128(lineg, 8);
599
600
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607
10.8M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609
10.8M
}
610
611
/*****************************************************************************/
612
/*                                                                           */
613
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614
/*                                                                           */
615
/*  Description   : This function performs filtering of a chroma block       */
616
/*                  horizontal edge when the boundary strength is less than  */
617
/*                  4 in high profile.                                       */
618
/*                                                                           */
619
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620
/*                  src_strd         - source stride                         */
621
/*                  alpha_cb         - alpha value for the boundary in U     */
622
/*                  beta_cb          - beta value for the boundary in U      */
623
/*                  alpha_cr         - alpha value for the boundary in V     */
624
/*                  beta_cr          - beta value for the boundary in V      */
625
/*                  u4_bs            - packed Boundary strength array        */
626
/*                  pu1_cliptab_cb   - tc0_table for U                       */
627
/*                  pu1_cliptab_cr   - tc0_table for V                       */
628
/*                                                                           */
629
/*  Globals       : None                                                     */
630
/*                                                                           */
631
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632
/*                  title "Filtering process for edges for bS less than 4"   */
633
/*                  in ITU T Rec H.264 with alpha and beta values different  */
634
/*                  in U and V.                                              */
635
/*                                                                           */
636
/*  Outputs       : None                                                     */
637
/*                                                                           */
638
/*  Returns       : None                                                     */
639
/*                                                                           */
640
/*  Issues        : None                                                     */
641
/*                                                                           */
642
/*  Revision History:                                                        */
643
/*                                                                           */
644
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645
/*         12 02 2015   Naveen Kumar P  Initial version                      */
646
/*                                                                           */
647
/*****************************************************************************/
648
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649
                                         WORD32 src_strd,
650
                                         WORD32 alpha_cb,
651
                                         WORD32 beta_cb,
652
                                         WORD32 alpha_cr,
653
                                         WORD32 beta_cr,
654
                                         UWORD32 u4_bs,
655
                                         const UWORD8 *pu1_cliptab_cb,
656
                                         const UWORD8 *pu1_cliptab_cr)
657
10.9M
{
658
10.9M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659
10.9M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
660
10.9M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662
10.9M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663
10.9M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664
10.9M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665
10.9M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666
10.9M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667
10.9M
    __m128i flag_bs, flag1, flag2;
668
10.9M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669
10.9M
    __m128i zero = _mm_setzero_si128();
670
10.9M
    __m128i C0_uv_8x16;
671
10.9M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673
10.9M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675
10.9M
    i16_posQ1 = src_strd;
676
10.9M
    i16_posP0 = src_strd;
677
10.9M
    i16_posP1 = 0;
678
679
10.9M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
680
10.9M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
681
10.9M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
682
10.9M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684
10.9M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685
10.9M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686
10.9M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687
10.9M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688
10.9M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690
10.9M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691
10.9M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692
10.9M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693
10.9M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695
10.9M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696
10.9M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697
10.9M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698
10.9M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700
10.9M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701
10.9M
    diff = _mm_abs_epi16(diff);
702
10.9M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703
10.9M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705
10.9M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706
10.9M
    diff = _mm_abs_epi16(diff);
707
10.9M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708
10.9M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710
10.9M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711
10.9M
    diff = _mm_abs_epi16(diff);
712
10.9M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714
10.9M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715
10.9M
    diff = _mm_slli_epi16(diff, 2);
716
10.9M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717
10.9M
    diff = _mm_add_epi16(diff, diff1);
718
10.9M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719
10.9M
    in_macro = _mm_srai_epi16(diff, 3);
720
721
10.9M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722
10.9M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723
10.9M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724
10.9M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726
10.9M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728
10.9M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729
10.9M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730
10.9M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732
10.9M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733
10.9M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735
10.9M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736
10.9M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737
10.9M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738
10.9M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740
10.9M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741
10.9M
    diff = _mm_abs_epi16(diff);
742
10.9M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743
10.9M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745
10.9M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746
10.9M
    diff = _mm_abs_epi16(diff);
747
10.9M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748
10.9M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750
10.9M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751
10.9M
    diff = _mm_abs_epi16(diff);
752
10.9M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754
10.9M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755
10.9M
    diff = _mm_slli_epi16(diff, 2);
756
10.9M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757
10.9M
    diff = _mm_add_epi16(diff, diff1);
758
10.9M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759
10.9M
    in_macro = _mm_srai_epi16(diff, 3);
760
761
10.9M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762
10.9M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763
10.9M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764
10.9M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766
10.9M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768
10.9M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769
10.9M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770
10.9M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772
10.9M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773
10.9M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775
10.9M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776
10.9M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778
10.9M
    flag1 = _mm_packs_epi16(flag1, flag2);
779
10.9M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781
10.9M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782
10.9M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783
10.9M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784
10.9M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785
10.9M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787
10.9M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788
10.9M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789
10.9M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790
10.9M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791
10.9M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793
10.9M
}
794
795
/*****************************************************************************/
796
/*                                                                           */
797
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798
/*                                                                           */
799
/*  Description   : This function performs filtering of a chroma block       */
800
/*                  vertical edge when boundary strength is set to 4 in high */
801
/*                  profile.                                                 */
802
/*                                                                           */
803
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804
/*                  src_strd         - source stride                         */
805
/*                  alpha_cb         - alpha value for the boundary in U     */
806
/*                  beta_cb          - beta value for the boundary in U      */
807
/*                  alpha_cr         - alpha value for the boundary in V     */
808
/*                  beta_cr          - beta value for the boundary in V      */
809
/*                  u4_bs            - packed Boundary strength array        */
810
/*                  pu1_cliptab_cb   - tc0_table for U                       */
811
/*                  pu1_cliptab_cr   - tc0_table for V                       */
812
/*                                                                           */
813
/*  Globals       : None                                                     */
814
/*                                                                           */
815
/*  Processing    : When the function is called twice, this operation is as  */
816
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818
/*                  with alpha and beta values different in U and V.         */
819
/*                                                                           */
820
/*  Outputs       : None                                                     */
821
/*                                                                           */
822
/*  Returns       : None                                                     */
823
/*                                                                           */
824
/*  Issues        : None                                                     */
825
/*                                                                           */
826
/*  Revision History:                                                        */
827
/*                                                                           */
828
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829
/*         12 02 2015   Naveen Kumar P  Initial version                      */
830
/*                                                                           */
831
/*****************************************************************************/
832
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833
                                             WORD32 src_strd,
834
                                             WORD32 alpha_cb,
835
                                             WORD32 beta_cb,
836
                                             WORD32 alpha_cr,
837
                                             WORD32 beta_cr)
838
23.6k
{
839
23.6k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840
23.6k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841
23.6k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842
23.6k
    __m128i linea, lineb, linec, lined;
843
23.6k
    __m128i temp1, temp2;
844
845
23.6k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846
23.6k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847
23.6k
    __m128i flag1;
848
23.6k
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849
23.6k
    __m128i zero = _mm_setzero_si128();
850
23.6k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852
    /* Load and transpose the pixel values */
853
23.6k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854
23.6k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855
23.6k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856
23.6k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858
23.6k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
859
23.6k
    temp2 = _mm_unpacklo_epi16(linec, lined);
860
861
23.6k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862
23.6k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863
23.6k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864
23.6k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865
    /* End of transpose */
866
867
23.6k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868
23.6k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869
23.6k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870
23.6k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872
23.6k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873
23.6k
    diff = _mm_abs_epi16(diff);
874
23.6k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875
23.6k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877
23.6k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878
23.6k
    diff = _mm_abs_epi16(diff);
879
23.6k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880
23.6k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882
23.6k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883
23.6k
    diff = _mm_abs_epi16(diff);
884
23.6k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886
23.6k
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887
23.6k
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888
23.6k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889
23.6k
    temp1 = _mm_add_epi16(temp1, temp2);
890
23.6k
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892
23.6k
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893
23.6k
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894
23.6k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895
23.6k
    temp1 = _mm_add_epi16(temp1, temp2);
896
23.6k
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898
23.6k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899
23.6k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901
23.6k
    flag1 = _mm_packs_epi16(flag1, flag1);
902
903
23.6k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904
23.6k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905
23.6k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906
23.6k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908
23.6k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909
23.6k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910
23.6k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911
23.6k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913
    /* Inverse-transpose and store back */
914
23.6k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915
23.6k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917
23.6k
    linea = _mm_unpacklo_epi32(temp1, temp2);
918
23.6k
    lineb = _mm_srli_si128(linea, 8);
919
23.6k
    linec = _mm_unpackhi_epi32(temp1, temp2);
920
23.6k
    lined = _mm_srli_si128(linec, 8);
921
922
23.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923
23.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924
23.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925
23.6k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927
23.6k
}
928
929
/*****************************************************************************/
930
/*                                                                           */
931
/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932
/*                                                                           */
933
/*  Description   : This function performs filtering of a chroma block       */
934
/*                  vertical edge when boundary strength is less than 4 in   */
935
/*                  high profile.                                            */
936
/*                                                                           */
937
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938
/*                  src_strd         - source stride                         */
939
/*                  alpha_cb         - alpha value for the boundary in U     */
940
/*                  beta_cb          - beta value for the boundary in U      */
941
/*                  alpha_cr         - alpha value for the boundary in V     */
942
/*                  beta_cr          - beta value for the boundary in V      */
943
/*                  u4_bs            - packed Boundary strength array        */
944
/*                  pu1_cliptab_cb   - tc0_table for U                       */
945
/*                  pu1_cliptab_cr   - tc0_table for V                       */
946
/*                                                                           */
947
/*  Globals       : None                                                     */
948
/*                                                                           */
949
/*  Processing    : When the function is called twice, this operation is as  */
950
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
951
/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952
/*                  with alpha and beta values different in U and V.         */
953
/*                                                                           */
954
/*  Outputs       : None                                                     */
955
/*                                                                           */
956
/*  Returns       : None                                                     */
957
/*                                                                           */
958
/*  Issues        : None                                                     */
959
/*                                                                           */
960
/*  Revision History:                                                        */
961
/*                                                                           */
962
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963
/*         12 02 2015   Naveen Kumar P  Initial version                      */
964
/*                                                                           */
965
/*****************************************************************************/
966
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967
                                               WORD32 src_strd,
968
                                               WORD32 alpha_cb,
969
                                               WORD32 beta_cb,
970
                                               WORD32 alpha_cr,
971
                                               WORD32 beta_cr,
972
                                               UWORD32 u4_bs,
973
                                               const UWORD8 *pu1_cliptab_cb,
974
                                               const UWORD8 *pu1_cliptab_cr)
975
16.0k
{
976
16.0k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977
16.0k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978
16.0k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979
16.0k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980
16.0k
    __m128i linea, lineb, linec, lined;
981
16.0k
    __m128i temp1, temp2;
982
983
16.0k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984
16.0k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985
16.0k
    __m128i flag_bs, flag1;
986
16.0k
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987
16.0k
    __m128i zero = _mm_setzero_si128();
988
16.0k
    __m128i C0_uv_8x16;
989
16.0k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991
16.0k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
992
16.0k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
993
16.0k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
994
16.0k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996
16.0k
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997
16.0k
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998
16.0k
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999
16.0k
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001
    /* Load and transpose the pixel values */
1002
16.0k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003
16.0k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004
16.0k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005
16.0k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007
16.0k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
1008
16.0k
    temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010
16.0k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011
16.0k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012
16.0k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013
16.0k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014
    /* End of transpose */
1015
1016
16.0k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017
16.0k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018
16.0k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019
16.0k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021
16.0k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022
16.0k
    diff = _mm_abs_epi16(diff);
1023
16.0k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024
16.0k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026
16.0k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027
16.0k
    diff = _mm_abs_epi16(diff);
1028
16.0k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029
16.0k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031
16.0k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032
16.0k
    diff = _mm_abs_epi16(diff);
1033
16.0k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035
16.0k
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036
16.0k
    diff = _mm_slli_epi16(diff, 2);
1037
16.0k
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038
16.0k
    diff = _mm_add_epi16(diff, diff1);
1039
16.0k
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040
16.0k
    in_macro = _mm_srai_epi16(diff, 3);
1041
1042
16.0k
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043
16.0k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044
16.0k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045
16.0k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047
16.0k
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049
16.0k
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050
16.0k
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051
16.0k
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053
16.0k
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054
16.0k
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056
16.0k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057
16.0k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059
16.0k
    flag1 = _mm_packs_epi16(flag1, flag1);
1060
16.0k
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062
16.0k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063
16.0k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064
16.0k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065
16.0k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067
16.0k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068
16.0k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069
16.0k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070
16.0k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072
    /* Inverse-transpose and store back */
1073
16.0k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074
16.0k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076
16.0k
    linea = _mm_unpacklo_epi32(temp1, temp2);
1077
16.0k
    lineb = _mm_srli_si128(linea, 8);
1078
16.0k
    linec = _mm_unpackhi_epi32(temp1, temp2);
1079
16.0k
    lined = _mm_srli_si128(linec, 8);
1080
1081
16.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082
16.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083
16.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084
16.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086
16.0k
}
1087