Coverage Report

Created: 2025-12-14 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_chroma_ssse3.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27
/*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28
/*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29
/*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30
/*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31
/*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32
/*                                                                           */
33
/*  Issues / Problems : None                                                 */
34
/*                                                                           */
35
/*  Revision History  :                                                      */
36
/*                                                                           */
37
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39
/*                                      intrinsics                           */
40
/*                                                                           */
41
/*****************************************************************************/
42
43
/*****************************************************************************/
44
/* File Includes                                                             */
45
/*****************************************************************************/
46
47
/* System include files */
48
#include <stdio.h>
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
/*****************************************************************************/
57
/* Function Definitions                                                      */
58
/*****************************************************************************/
59
60
/*****************************************************************************/
61
/*                                                                           */
62
/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63
/*                                                                           */
64
/*  Description   : This function performs filtering of a chroma block       */
65
/*                  vertical edge when the boundary strength is set to 4 in  */
66
/*                  high profile.                                            */
67
/*                                                                           */
68
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69
/*                  src_strd   - source stride                               */
70
/*                  alpha_cb   - alpha value for the boundary in U           */
71
/*                  beta_cb    - beta value for the boundary in U            */
72
/*                  alpha_cr   - alpha value for the boundary in V           */
73
/*                  beta_cr    - beta value for the boundary in V            */
74
/*                                                                           */
75
/*  Globals       : None                                                     */
76
/*                                                                           */
77
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78
/*                  title "Filtering process for edges for bS equal to 4" in */
79
/*                  ITU T Rec H.264 with alpha and beta values different in  */
80
/*                  U and V.                                                 */
81
/*                                                                           */
82
/*  Outputs       : None                                                     */
83
/*                                                                           */
84
/*  Returns       : None                                                     */
85
/*                                                                           */
86
/*  Issues        : None                                                     */
87
/*                                                                           */
88
/*  Revision History:                                                        */
89
/*                                                                           */
90
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91
/*         12 02 2015   Naveen Kumar P  Initial version                      */
92
/*                                                                           */
93
/*****************************************************************************/
94
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95
                                       WORD32 src_strd,
96
                                       WORD32 alpha_cb,
97
                                       WORD32 beta_cb,
98
                                       WORD32 alpha_cr,
99
                                       WORD32 beta_cr)
100
7.85M
{
101
7.85M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102
7.85M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103
7.85M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104
7.85M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105
7.85M
    __m128i temp1, temp2, temp3, temp4;
106
107
7.85M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108
7.85M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109
7.85M
    __m128i flag1, flag2;
110
7.85M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111
7.85M
    __m128i zero = _mm_setzero_si128();
112
7.85M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114
    /* Load and transpose the pixel values */
115
7.85M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116
7.85M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117
7.85M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118
7.85M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119
7.85M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120
7.85M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121
7.85M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122
7.85M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124
7.85M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
125
7.85M
    temp2 = _mm_unpacklo_epi16(linec, lined);
126
7.85M
    temp3 = _mm_unpacklo_epi16(linee, linef);
127
7.85M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129
7.85M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130
7.85M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131
7.85M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132
7.85M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134
7.85M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135
7.85M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136
7.85M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137
7.85M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138
    /* End of transpose */
139
140
7.85M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141
7.85M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142
7.85M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143
7.85M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145
7.85M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146
7.85M
    diff = _mm_abs_epi16(diff);
147
7.85M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148
7.85M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150
7.85M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151
7.85M
    diff = _mm_abs_epi16(diff);
152
7.85M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153
7.85M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155
7.85M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156
7.85M
    diff = _mm_abs_epi16(diff);
157
7.85M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159
7.85M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160
7.85M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161
7.85M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162
7.85M
    temp1 = _mm_add_epi16(temp1, temp2);
163
7.85M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165
7.85M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166
7.85M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167
7.85M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168
7.85M
    temp1 = _mm_add_epi16(temp1, temp2);
169
7.85M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171
7.85M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172
7.85M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173
7.85M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174
7.85M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176
7.85M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177
7.85M
    diff = _mm_abs_epi16(diff);
178
7.85M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179
7.85M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181
7.85M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182
7.85M
    diff = _mm_abs_epi16(diff);
183
7.85M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184
7.85M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186
7.85M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187
7.85M
    diff = _mm_abs_epi16(diff);
188
7.85M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190
7.85M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191
7.85M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192
7.85M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193
7.85M
    temp1 = _mm_add_epi16(temp1, temp2);
194
7.85M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196
7.85M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197
7.85M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198
7.85M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199
7.85M
    temp1 = _mm_add_epi16(temp1, temp2);
200
7.85M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202
7.85M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203
7.85M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205
7.85M
    flag1 = _mm_packs_epi16(flag1, flag2);
206
207
7.85M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208
7.85M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209
7.85M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210
7.85M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212
7.85M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213
7.85M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214
7.85M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215
7.85M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217
    /* Inverse-transpose and store back */
218
7.85M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219
7.85M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220
7.85M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221
7.85M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223
7.85M
    linea = _mm_unpacklo_epi32(temp1, temp3);
224
7.85M
    lineb = _mm_srli_si128(linea, 8);
225
7.85M
    linec = _mm_unpackhi_epi32(temp1, temp3);
226
7.85M
    lined = _mm_srli_si128(linec, 8);
227
7.85M
    linee = _mm_unpacklo_epi32(temp2, temp4);
228
7.85M
    linef = _mm_srli_si128(linee, 8);
229
7.85M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
230
7.85M
    lineh = _mm_srli_si128(lineg, 8);
231
232
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239
7.85M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241
7.85M
}
242
243
/*****************************************************************************/
244
/*                                                                           */
245
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246
/*                                                                           */
247
/*  Description   : This function performs filtering of a chroma block       */
248
/*                  horizontal edge when the boundary strength is set to 4   */
249
/*                  in high profile.                                         */
250
/*                                                                           */
251
/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252
/*                  src_strd   - source stride                               */
253
/*                  alpha_cb   - alpha value for the boundary in U           */
254
/*                  beta_cb    - beta value for the boundary in U            */
255
/*                  alpha_cr   - alpha value for the boundary in V           */
256
/*                  beta_cr    - beta value for the boundary in V            */
257
/*                                                                           */
258
/*  Globals       : None                                                     */
259
/*                                                                           */
260
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261
/*                  title "Filtering process for edges for bS equal to 4" in */
262
/*                  ITU T Rec H.264 with alpha and beta values different in  */
263
/*                  U and V.                                                 */
264
/*                                                                           */
265
/*  Outputs       : None                                                     */
266
/*                                                                           */
267
/*  Returns       : None                                                     */
268
/*                                                                           */
269
/*  Issues        : None                                                     */
270
/*                                                                           */
271
/*  Revision History:                                                        */
272
/*                                                                           */
273
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274
/*         12 02 2015   Naveen Kumar P  Initial version                      */
275
/*                                                                           */
276
/*****************************************************************************/
277
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278
                                       WORD32 src_strd,
279
                                       WORD32 alpha_cb,
280
                                       WORD32 beta_cb,
281
                                       WORD32 alpha_cr,
282
                                       WORD32 beta_cr)
283
7.76M
{
284
7.76M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285
7.76M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287
7.76M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288
7.76M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289
7.76M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290
7.76M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291
7.76M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292
7.76M
    __m128i flag1, flag2;
293
7.76M
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294
7.76M
    __m128i zero = _mm_setzero_si128();
295
7.76M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296
7.76M
    __m128i temp1, temp2;
297
298
7.76M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300
7.76M
    i16_posQ1 = src_strd;
301
7.76M
    i16_posP0 = src_strd;
302
7.76M
    i16_posP1 = 0;
303
304
7.76M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305
7.76M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306
7.76M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307
7.76M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309
7.76M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310
7.76M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311
7.76M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312
7.76M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314
7.76M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315
7.76M
    diff = _mm_abs_epi16(diff);
316
7.76M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317
7.76M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319
7.76M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320
7.76M
    diff = _mm_abs_epi16(diff);
321
7.76M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322
7.76M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324
7.76M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325
7.76M
    diff = _mm_abs_epi16(diff);
326
7.76M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328
7.76M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329
7.76M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330
7.76M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331
7.76M
    temp1 = _mm_add_epi16(temp1, temp2);
332
7.76M
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334
7.76M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335
7.76M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336
7.76M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337
7.76M
    temp1 = _mm_add_epi16(temp1, temp2);
338
7.76M
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340
7.76M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341
7.76M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342
7.76M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343
7.76M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345
7.76M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346
7.76M
    diff = _mm_abs_epi16(diff);
347
7.76M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348
7.76M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350
7.76M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351
7.76M
    diff = _mm_abs_epi16(diff);
352
7.76M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353
7.76M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355
7.76M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356
7.76M
    diff = _mm_abs_epi16(diff);
357
7.76M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359
7.76M
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360
7.76M
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361
7.76M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362
7.76M
    temp1 = _mm_add_epi16(temp1, temp2);
363
7.76M
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365
7.76M
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366
7.76M
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367
7.76M
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368
7.76M
    temp1 = _mm_add_epi16(temp1, temp2);
369
7.76M
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371
7.76M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372
7.76M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374
7.76M
    flag1 = _mm_packs_epi16(flag1, flag2);
375
376
7.76M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377
7.76M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378
7.76M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379
7.76M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380
7.76M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382
7.76M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383
7.76M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384
7.76M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385
7.76M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386
7.76M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388
7.76M
}
389
390
/*****************************************************************************/
391
/*                                                                           */
392
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393
/*                                                                           */
394
/*  Description   : This function performs filtering of a chroma block       */
395
/*                  vertical edge when the boundary strength is less than 4  */
396
/*                  in high profile.                                         */
397
/*                                                                           */
398
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399
/*                  src_strd         - source stride                         */
400
/*                  alpha_cb         - alpha value for the boundary in U     */
401
/*                  beta_cb          - beta value for the boundary in U      */
402
/*                  alpha_cr         - alpha value for the boundary in V     */
403
/*                  beta_cr          - beta value for the boundary in V      */
404
/*                  u4_bs            - packed Boundary strength array        */
405
/*                  pu1_cliptab_cb   - tc0_table for U                       */
406
/*                  pu1_cliptab_cr   - tc0_table for V                       */
407
/*                                                                           */
408
/*  Globals       : None                                                     */
409
/*                                                                           */
410
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411
/*                  title "Filtering process for edges for bS less than 4"   */
412
/*                  in ITU T Rec H.264 with alpha and beta values different  */
413
/*                  in U and V.                                              */
414
/*                                                                           */
415
/*  Outputs       : None                                                     */
416
/*                                                                           */
417
/*  Returns       : None                                                     */
418
/*                                                                           */
419
/*  Issues        : None                                                     */
420
/*                                                                           */
421
/*  Revision History:                                                        */
422
/*                                                                           */
423
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424
/*         12 02 2015   Naveen Kumar P  Initial version                      */
425
/*                                                                           */
426
/*****************************************************************************/
427
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428
                                         WORD32 src_strd,
429
                                         WORD32 alpha_cb,
430
                                         WORD32 beta_cb,
431
                                         WORD32 alpha_cr,
432
                                         WORD32 beta_cr,
433
                                         UWORD32 u4_bs,
434
                                         const UWORD8 *pu1_cliptab_cb,
435
                                         const UWORD8 *pu1_cliptab_cr)
436
9.96M
{
437
9.96M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438
9.96M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439
9.96M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440
9.96M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441
9.96M
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442
9.96M
    __m128i temp1, temp2, temp3, temp4;
443
444
9.96M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445
9.96M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446
9.96M
    __m128i flag_bs, flag1, flag2;
447
9.96M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448
9.96M
    __m128i zero = _mm_setzero_si128();
449
9.96M
    __m128i C0_uv_8x16;
450
9.96M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452
9.96M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
453
9.96M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
454
9.96M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
455
9.96M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457
9.96M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458
9.96M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459
9.96M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460
9.96M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461
9.96M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463
    /* Load and transpose the pixel values */
464
9.96M
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465
9.96M
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466
9.96M
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467
9.96M
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468
9.96M
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469
9.96M
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470
9.96M
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471
9.96M
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473
9.96M
    temp1 = _mm_unpacklo_epi16(linea, lineb);
474
9.96M
    temp2 = _mm_unpacklo_epi16(linec, lined);
475
9.96M
    temp3 = _mm_unpacklo_epi16(linee, linef);
476
9.96M
    temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478
9.96M
    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479
9.96M
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480
9.96M
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481
9.96M
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483
9.96M
    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484
9.96M
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485
9.96M
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486
9.96M
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487
    /* End of transpose */
488
489
9.96M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490
9.96M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491
9.96M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492
9.96M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494
9.96M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495
9.96M
    diff = _mm_abs_epi16(diff);
496
9.96M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497
9.96M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499
9.96M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500
9.96M
    diff = _mm_abs_epi16(diff);
501
9.96M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502
9.96M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504
9.96M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505
9.96M
    diff = _mm_abs_epi16(diff);
506
9.96M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508
9.96M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509
9.96M
    diff = _mm_slli_epi16(diff, 2);
510
9.96M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511
9.96M
    diff = _mm_add_epi16(diff, diff1);
512
9.96M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513
9.96M
    in_macro = _mm_srai_epi16(diff, 3);
514
515
9.96M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516
9.96M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517
9.96M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518
9.96M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520
9.96M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522
9.96M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523
9.96M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524
9.96M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526
9.96M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527
9.96M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529
9.96M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530
9.96M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531
9.96M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532
9.96M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534
9.96M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535
9.96M
    diff = _mm_abs_epi16(diff);
536
9.96M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537
9.96M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539
9.96M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540
9.96M
    diff = _mm_abs_epi16(diff);
541
9.96M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542
9.96M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544
9.96M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545
9.96M
    diff = _mm_abs_epi16(diff);
546
9.96M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548
9.96M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549
9.96M
    diff = _mm_slli_epi16(diff, 2);
550
9.96M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551
9.96M
    diff = _mm_add_epi16(diff, diff1);
552
9.96M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553
9.96M
    in_macro = _mm_srai_epi16(diff, 3);
554
555
9.96M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556
9.96M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557
9.96M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558
9.96M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560
9.96M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562
9.96M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563
9.96M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564
9.96M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566
9.96M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567
9.96M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569
9.96M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570
9.96M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572
9.96M
    flag1 = _mm_packs_epi16(flag1, flag2);
573
9.96M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575
9.96M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576
9.96M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577
9.96M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578
9.96M
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580
9.96M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581
9.96M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582
9.96M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583
9.96M
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585
    /* Inverse-transpose and store back */
586
9.96M
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587
9.96M
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588
9.96M
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589
9.96M
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591
9.96M
    linea = _mm_unpacklo_epi32(temp1, temp3);
592
9.96M
    lineb = _mm_srli_si128(linea, 8);
593
9.96M
    linec = _mm_unpackhi_epi32(temp1, temp3);
594
9.96M
    lined = _mm_srli_si128(linec, 8);
595
9.96M
    linee = _mm_unpacklo_epi32(temp2, temp4);
596
9.96M
    linef = _mm_srli_si128(linee, 8);
597
9.96M
    lineg = _mm_unpackhi_epi32(temp2, temp4);
598
9.96M
    lineh = _mm_srli_si128(lineg, 8);
599
600
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607
9.96M
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609
9.96M
}
610
611
/*****************************************************************************/
612
/*                                                                           */
613
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614
/*                                                                           */
615
/*  Description   : This function performs filtering of a chroma block       */
616
/*                  horizontal edge when the boundary strength is less than  */
617
/*                  4 in high profile.                                       */
618
/*                                                                           */
619
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620
/*                  src_strd         - source stride                         */
621
/*                  alpha_cb         - alpha value for the boundary in U     */
622
/*                  beta_cb          - beta value for the boundary in U      */
623
/*                  alpha_cr         - alpha value for the boundary in V     */
624
/*                  beta_cr          - beta value for the boundary in V      */
625
/*                  u4_bs            - packed Boundary strength array        */
626
/*                  pu1_cliptab_cb   - tc0_table for U                       */
627
/*                  pu1_cliptab_cr   - tc0_table for V                       */
628
/*                                                                           */
629
/*  Globals       : None                                                     */
630
/*                                                                           */
631
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632
/*                  title "Filtering process for edges for bS less than 4"   */
633
/*                  in ITU T Rec H.264 with alpha and beta values different  */
634
/*                  in U and V.                                              */
635
/*                                                                           */
636
/*  Outputs       : None                                                     */
637
/*                                                                           */
638
/*  Returns       : None                                                     */
639
/*                                                                           */
640
/*  Issues        : None                                                     */
641
/*                                                                           */
642
/*  Revision History:                                                        */
643
/*                                                                           */
644
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645
/*         12 02 2015   Naveen Kumar P  Initial version                      */
646
/*                                                                           */
647
/*****************************************************************************/
648
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649
                                         WORD32 src_strd,
650
                                         WORD32 alpha_cb,
651
                                         WORD32 beta_cb,
652
                                         WORD32 alpha_cr,
653
                                         WORD32 beta_cr,
654
                                         UWORD32 u4_bs,
655
                                         const UWORD8 *pu1_cliptab_cb,
656
                                         const UWORD8 *pu1_cliptab_cr)
657
10.0M
{
658
10.0M
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659
10.0M
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
660
10.0M
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662
10.0M
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663
10.0M
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664
10.0M
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665
10.0M
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666
10.0M
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667
10.0M
    __m128i flag_bs, flag1, flag2;
668
10.0M
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669
10.0M
    __m128i zero = _mm_setzero_si128();
670
10.0M
    __m128i C0_uv_8x16;
671
10.0M
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673
10.0M
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675
10.0M
    i16_posQ1 = src_strd;
676
10.0M
    i16_posP0 = src_strd;
677
10.0M
    i16_posP1 = 0;
678
679
10.0M
    u1_Bs0 = (u4_bs >> 24) & 0xff;
680
10.0M
    u1_Bs1 = (u4_bs >> 16) & 0xff;
681
10.0M
    u1_Bs2 = (u4_bs >> 8) & 0xff;
682
10.0M
    u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684
10.0M
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685
10.0M
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686
10.0M
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687
10.0M
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688
10.0M
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690
10.0M
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691
10.0M
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692
10.0M
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693
10.0M
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695
10.0M
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696
10.0M
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697
10.0M
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698
10.0M
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700
10.0M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701
10.0M
    diff = _mm_abs_epi16(diff);
702
10.0M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703
10.0M
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705
10.0M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706
10.0M
    diff = _mm_abs_epi16(diff);
707
10.0M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708
10.0M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710
10.0M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711
10.0M
    diff = _mm_abs_epi16(diff);
712
10.0M
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714
10.0M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715
10.0M
    diff = _mm_slli_epi16(diff, 2);
716
10.0M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717
10.0M
    diff = _mm_add_epi16(diff, diff1);
718
10.0M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719
10.0M
    in_macro = _mm_srai_epi16(diff, 3);
720
721
10.0M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722
10.0M
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723
10.0M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724
10.0M
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726
10.0M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728
10.0M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729
10.0M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730
10.0M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732
10.0M
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733
10.0M
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735
10.0M
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736
10.0M
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737
10.0M
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738
10.0M
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740
10.0M
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741
10.0M
    diff = _mm_abs_epi16(diff);
742
10.0M
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743
10.0M
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745
10.0M
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746
10.0M
    diff = _mm_abs_epi16(diff);
747
10.0M
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748
10.0M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750
10.0M
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751
10.0M
    diff = _mm_abs_epi16(diff);
752
10.0M
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754
10.0M
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755
10.0M
    diff = _mm_slli_epi16(diff, 2);
756
10.0M
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757
10.0M
    diff = _mm_add_epi16(diff, diff1);
758
10.0M
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759
10.0M
    in_macro = _mm_srai_epi16(diff, 3);
760
761
10.0M
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762
10.0M
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763
10.0M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764
10.0M
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766
10.0M
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768
10.0M
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769
10.0M
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770
10.0M
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772
10.0M
    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773
10.0M
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775
10.0M
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776
10.0M
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778
10.0M
    flag1 = _mm_packs_epi16(flag1, flag2);
779
10.0M
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781
10.0M
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782
10.0M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783
10.0M
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784
10.0M
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785
10.0M
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787
10.0M
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788
10.0M
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789
10.0M
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790
10.0M
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791
10.0M
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793
10.0M
}
794
795
/*****************************************************************************/
796
/*                                                                           */
797
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798
/*                                                                           */
799
/*  Description   : This function performs filtering of a chroma block       */
800
/*                  vertical edge when boundary strength is set to 4 in high */
801
/*                  profile.                                                 */
802
/*                                                                           */
803
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804
/*                  src_strd         - source stride                         */
805
/*                  alpha_cb         - alpha value for the boundary in U     */
806
/*                  beta_cb          - beta value for the boundary in U      */
807
/*                  alpha_cr         - alpha value for the boundary in V     */
808
/*                  beta_cr          - beta value for the boundary in V      */
809
/*                  u4_bs            - packed Boundary strength array        */
810
/*                  pu1_cliptab_cb   - tc0_table for U                       */
811
/*                  pu1_cliptab_cr   - tc0_table for V                       */
812
/*                                                                           */
813
/*  Globals       : None                                                     */
814
/*                                                                           */
815
/*  Processing    : When the function is called twice, this operation is as  */
816
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818
/*                  with alpha and beta values different in U and V.         */
819
/*                                                                           */
820
/*  Outputs       : None                                                     */
821
/*                                                                           */
822
/*  Returns       : None                                                     */
823
/*                                                                           */
824
/*  Issues        : None                                                     */
825
/*                                                                           */
826
/*  Revision History:                                                        */
827
/*                                                                           */
828
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829
/*         12 02 2015   Naveen Kumar P  Initial version                      */
830
/*                                                                           */
831
/*****************************************************************************/
832
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833
                                             WORD32 src_strd,
834
                                             WORD32 alpha_cb,
835
                                             WORD32 beta_cb,
836
                                             WORD32 alpha_cr,
837
                                             WORD32 beta_cr)
838
35.3k
{
839
35.3k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840
35.3k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841
35.3k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842
35.3k
    __m128i linea, lineb, linec, lined;
843
35.3k
    __m128i temp1, temp2;
844
845
35.3k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846
35.3k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847
35.3k
    __m128i flag1;
848
35.3k
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849
35.3k
    __m128i zero = _mm_setzero_si128();
850
35.3k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852
    /* Load and transpose the pixel values */
853
35.3k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854
35.3k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855
35.3k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856
35.3k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858
35.3k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
859
35.3k
    temp2 = _mm_unpacklo_epi16(linec, lined);
860
861
35.3k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862
35.3k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863
35.3k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864
35.3k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865
    /* End of transpose */
866
867
35.3k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868
35.3k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869
35.3k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870
35.3k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872
35.3k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873
35.3k
    diff = _mm_abs_epi16(diff);
874
35.3k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875
35.3k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877
35.3k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878
35.3k
    diff = _mm_abs_epi16(diff);
879
35.3k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880
35.3k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882
35.3k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883
35.3k
    diff = _mm_abs_epi16(diff);
884
35.3k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886
35.3k
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887
35.3k
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888
35.3k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889
35.3k
    temp1 = _mm_add_epi16(temp1, temp2);
890
35.3k
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892
35.3k
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893
35.3k
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894
35.3k
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895
35.3k
    temp1 = _mm_add_epi16(temp1, temp2);
896
35.3k
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898
35.3k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899
35.3k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901
35.3k
    flag1 = _mm_packs_epi16(flag1, flag1);
902
903
35.3k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904
35.3k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905
35.3k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906
35.3k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908
35.3k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909
35.3k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910
35.3k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911
35.3k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913
    /* Inverse-transpose and store back */
914
35.3k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915
35.3k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917
35.3k
    linea = _mm_unpacklo_epi32(temp1, temp2);
918
35.3k
    lineb = _mm_srli_si128(linea, 8);
919
35.3k
    linec = _mm_unpackhi_epi32(temp1, temp2);
920
35.3k
    lined = _mm_srli_si128(linec, 8);
921
922
35.3k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923
35.3k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924
35.3k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925
35.3k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927
35.3k
}
928
929
/*****************************************************************************/
930
/*                                                                           */
931
/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932
/*                                                                           */
933
/*  Description   : This function performs filtering of a chroma block       */
934
/*                  vertical edge when boundary strength is less than 4 in   */
935
/*                  high profile.                                            */
936
/*                                                                           */
937
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938
/*                  src_strd         - source stride                         */
939
/*                  alpha_cb         - alpha value for the boundary in U     */
940
/*                  beta_cb          - beta value for the boundary in U      */
941
/*                  alpha_cr         - alpha value for the boundary in V     */
942
/*                  beta_cr          - beta value for the boundary in V      */
943
/*                  u4_bs            - packed Boundary strength array        */
944
/*                  pu1_cliptab_cb   - tc0_table for U                       */
945
/*                  pu1_cliptab_cr   - tc0_table for V                       */
946
/*                                                                           */
947
/*  Globals       : None                                                     */
948
/*                                                                           */
949
/*  Processing    : When the function is called twice, this operation is as  */
950
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
951
/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952
/*                  with alpha and beta values different in U and V.         */
953
/*                                                                           */
954
/*  Outputs       : None                                                     */
955
/*                                                                           */
956
/*  Returns       : None                                                     */
957
/*                                                                           */
958
/*  Issues        : None                                                     */
959
/*                                                                           */
960
/*  Revision History:                                                        */
961
/*                                                                           */
962
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963
/*         12 02 2015   Naveen Kumar P  Initial version                      */
964
/*                                                                           */
965
/*****************************************************************************/
966
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967
                                               WORD32 src_strd,
968
                                               WORD32 alpha_cb,
969
                                               WORD32 beta_cb,
970
                                               WORD32 alpha_cr,
971
                                               WORD32 beta_cr,
972
                                               UWORD32 u4_bs,
973
                                               const UWORD8 *pu1_cliptab_cb,
974
                                               const UWORD8 *pu1_cliptab_cr)
975
10.7k
{
976
10.7k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977
10.7k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978
10.7k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979
10.7k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980
10.7k
    __m128i linea, lineb, linec, lined;
981
10.7k
    __m128i temp1, temp2;
982
983
10.7k
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984
10.7k
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985
10.7k
    __m128i flag_bs, flag1;
986
10.7k
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987
10.7k
    __m128i zero = _mm_setzero_si128();
988
10.7k
    __m128i C0_uv_8x16;
989
10.7k
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991
10.7k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
992
10.7k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
993
10.7k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
994
10.7k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996
10.7k
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997
10.7k
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998
10.7k
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999
10.7k
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001
    /* Load and transpose the pixel values */
1002
10.7k
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003
10.7k
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004
10.7k
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005
10.7k
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007
10.7k
    temp1 = _mm_unpacklo_epi16(linea, lineb);
1008
10.7k
    temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010
10.7k
    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011
10.7k
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012
10.7k
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013
10.7k
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014
    /* End of transpose */
1015
1016
10.7k
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017
10.7k
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018
10.7k
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019
10.7k
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021
10.7k
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022
10.7k
    diff = _mm_abs_epi16(diff);
1023
10.7k
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024
10.7k
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026
10.7k
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027
10.7k
    diff = _mm_abs_epi16(diff);
1028
10.7k
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029
10.7k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031
10.7k
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032
10.7k
    diff = _mm_abs_epi16(diff);
1033
10.7k
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035
10.7k
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036
10.7k
    diff = _mm_slli_epi16(diff, 2);
1037
10.7k
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038
10.7k
    diff = _mm_add_epi16(diff, diff1);
1039
10.7k
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040
10.7k
    in_macro = _mm_srai_epi16(diff, 3);
1041
1042
10.7k
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043
10.7k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044
10.7k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045
10.7k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047
10.7k
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049
10.7k
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050
10.7k
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051
10.7k
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053
10.7k
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054
10.7k
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056
10.7k
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057
10.7k
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059
10.7k
    flag1 = _mm_packs_epi16(flag1, flag1);
1060
10.7k
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062
10.7k
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063
10.7k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064
10.7k
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065
10.7k
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067
10.7k
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068
10.7k
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069
10.7k
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070
10.7k
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072
    /* Inverse-transpose and store back */
1073
10.7k
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074
10.7k
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076
10.7k
    linea = _mm_unpacklo_epi32(temp1, temp2);
1077
10.7k
    lineb = _mm_srli_si128(linea, 8);
1078
10.7k
    linec = _mm_unpackhi_epi32(temp1, temp2);
1079
10.7k
    lined = _mm_srli_si128(linec, 8);
1080
1081
10.7k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082
10.7k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083
10.7k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084
10.7k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086
10.7k
}
1087