Coverage Report

Created: 2026-05-30 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_chroma_avx2.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
22
/*****************************************************************************/
23
/* File Includes                                                             */
24
/*****************************************************************************/
25
26
/* System include files */
27
#include <stdio.h>
28
29
#ifdef __ANDROID__
30
#include "log/log.h"
31
#include <cutils/log.h>
32
#endif
33
34
/* User include files */
35
#include "ih264_typedefs.h"
36
#include "ih264_platform_macros.h"
37
#include "ih264_deblk_edge_filters.h"
38
#include "ih264_macros.h"
39
40
#include <stdint.h>
41
#include <string.h>
42
#include <immintrin.h>
43
44
45
46
/*****************************************************************************/
47
/*                                                                           */
48
/*  Function Name : ih264_deblk_chroma_vert_bslt4_avx2()                    */
49
/*                                                                           */
50
/*  Description   : This function performs filtering of a chroma block       */
51
/*                  vertical edge when the boundary strength is less than 4  */
52
/*                  in high profile.                                         */
53
/*                                                                           */
54
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
55
/*                  src_strd         - source stride                         */
56
/*                  alpha_cb         - alpha value for the boundary in U     */
57
/*                  beta_cb          - beta value for the boundary in U      */
58
/*                  alpha_cr         - alpha value for the boundary in V     */
59
/*                  beta_cr          - beta value for the boundary in V      */
60
/*                  u4_bs            - packed Boundary strength array        */
61
/*                  pu1_cliptab_cb   - tc0_table for U                       */
62
/*                  pu1_cliptab_cr   - tc0_table for V                       */
63
/*                                                                           */
64
/*  Globals       : None                                                     */
65
/*                                                                           */
66
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
67
/*                  title "Filtering process for edges for bS less than 4"   */
68
/*                  in ITU T Rec H.264 with alpha and beta values different  */
69
/*                  in U and V.                                              */
70
/*                                                                           */
71
/*  Outputs       : None                                                     */
72
/*                                                                           */
73
/*  Returns       : None                                                     */
74
/*                                                                           */
75
/*  Issues        : None                                                     */
76
/*                                                                           */
77
/*  Revision History:                                                        */
78
/*                                                                           */
79
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
80
/*         12 02 2015   Naveen Kumar P  Initial version                      */
81
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
82
/*****************************************************************************/
83
84
void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
85
                                         WORD32 src_strd,
86
                                         WORD32 alpha_cb,
87
                                         WORD32 beta_cb,
88
                                         WORD32 alpha_cr,
89
                                         WORD32 beta_cr,
90
                                         UWORD32 u4_bs,
91
                                         const UWORD8 *pu1_cliptab_cb,
92
                                         const UWORD8 *pu1_cliptab_cr)
93
77.0k
{
94
77.0k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
95
77.0k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
96
77.0k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
97
77.0k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
98
77.0k
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
99
77.0k
    __m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh;
100
77.0k
    __m256i temp1, temp2, temp3, temp4;
101
77.0k
    __m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32;
102
103
77.0k
    __m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2;
104
77.0k
    __m256i flag_bs, flag1, flag2;
105
77.0k
    __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
106
77.0k
    __m256i zero = _mm256_setzero_si256();
107
77.0k
    __m256i C0_uv_8x32;
108
77.0k
    __m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1;
109
110
77.0k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
111
77.0k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
112
77.0k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
113
77.0k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
114
115
77.0k
    flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
116
77.0k
                              u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
117
77.0k
                              u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
118
77.0k
                              u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
119
77.0k
    flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
120
77.0k
    flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
121
122
    /* Load and transpose the pixel values */
123
77.0k
    lineab =  _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4));
124
77.0k
    linecd =  _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
125
77.0k
    lineef =  _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
126
77.0k
    linegh =  _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
127
128
77.0k
    temp1 = _mm256_unpacklo_epi64(lineab, zero);  //a0 -- a7  000.. b0..b7 000
129
77.0k
    temp2 = _mm256_unpacklo_epi64(linecd, zero);
130
77.0k
    temp3 = _mm256_unpacklo_epi64(lineef, zero);  //e0 -- e7  000.. f0..f7 000
131
77.0k
    temp4 = _mm256_unpacklo_epi64(linegh, zero);
132
133
77.0k
    temp1 = _mm256_unpacklo_epi16(temp1, temp2);  //a0 a1 c0 c1 --  a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7
134
77.0k
    temp2 = _mm256_unpacklo_epi16(temp3, temp4);  //e0 e1 g0 g1                f0 f1 h0 h1
135
136
77.0k
    t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);
137
77.0k
    t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);
138
139
77.0k
    tmp1 = _mm256_unpacklo_epi16(t2, t3);    //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 ....  e0 e1 f0 f1 g0 g1 h0 h1  -e2 e3..
140
77.0k
    tmp2 = _mm256_unpackhi_epi16(t2, t3);    //a4 a5 b4 b5             -a6 a7 b6 b7
141
142
143
77.0k
    temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 -  e0 0 e1 0 ..   => p1
144
77.0k
    temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0                                                 => p0
145
77.0k
    temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0                                                  => q0
146
77.0k
    temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0                                                 => q1
147
148
77.0k
    pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4);     // 0213
149
77.0k
    pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3);     //0213
150
151
77.0k
    diff = _mm256_subs_epi16(temp2, temp3); //Condn 1    (p0 -q0) - set (3), set(3)
152
77.0k
    diff = _mm256_abs_epi16(diff);
153
77.0k
    alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
154
77.0k
    flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
155
156
77.0k
    diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2   (q1 -q0)
157
77.0k
    diff = _mm256_abs_epi16(diff);
158
77.0k
    beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
159
77.0k
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
160
161
162
77.0k
    diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3  (p1 -p0)
163
77.0k
    diff = _mm256_abs_epi16(diff);
164
77.0k
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
165
166
77.0k
    diff = _mm256_subs_epi16(temp3, temp2);     //(q0 -p0)
167
77.0k
    diff = _mm256_slli_epi16(diff, 2);
168
169
77.0k
    diff1 = _mm256_subs_epi16(temp1, temp4);     //(p1 -q1)
170
77.0k
    diff = _mm256_add_epi16(diff, diff1);
171
172
77.0k
    diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
173
77.0k
    in_macro = _mm256_srai_epi16(diff, 3);
174
175
176
77.0k
    C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
177
77.0k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
178
77.0k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
179
77.0k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
180
77.0k
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
181
77.0k
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
182
77.0k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
183
77.0k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
184
185
77.0k
    C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
186
187
77.0k
    in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
188
77.0k
    C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
189
77.0k
    in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
190
191
77.0k
    p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro);
192
77.0k
    q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro);
193
194
195
77.0k
    flag1 = _mm256_and_si256(flag1, flag_bs);
196
77.0k
    flag1 = _mm256_packs_epi16(flag1, flag1);  // 0213
197
198
77.0k
    pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213
199
200
77.0k
    pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8,
201
77.0k
                                 _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
202
77.0k
    pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1);
203
77.0k
    pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2);
204
205
206
77.0k
    t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8);   // temp1 temp3
207
77.0k
    t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8);   // temp2 temp4
208
209
77.0k
    t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1));  // pshuflw
210
77.0k
    t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));
211
212
77.0k
    lineae = _mm256_unpacklo_epi32(t1, t4);   // temp1 temp3
213
77.0k
    linecg = _mm256_unpackhi_epi32(t1, t4);   // temp2 temp4
214
215
77.0k
    linea =  _mm256_castsi256_si128(lineae);
216
77.0k
    lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
217
77.0k
    lineae =  _mm256_permute2f128_si256(lineae, lineae, 0x1);
218
77.0k
    linee =  _mm256_castsi256_si128(lineae);
219
77.0k
    linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
220
221
222
77.0k
    linec =  _mm256_castsi256_si128(linecg);
223
77.0k
    lined =  _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
224
77.0k
    linecg =  _mm256_permute2f128_si256(linecg, linecg, 0x1);
225
77.0k
    lineg =   _mm256_castsi256_si128(linecg);
226
77.0k
    lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
227
228
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
229
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
230
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
231
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
232
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
233
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
234
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
235
77.0k
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
236
237
77.0k
}
238
239
/*****************************************************************************/
240
/*                                                                           */
241
/*  Function Name : ih264_deblk_chroma_horz_bslt4_avx2()                    */
242
/*                                                                           */
243
/*  Description   : This function performs filtering of a chroma block       */
244
/*                  horizontal edge when the boundary strength is less than  */
245
/*                  4 in high profile.                                       */
246
/*                                                                           */
247
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
248
/*                  src_strd         - source stride                         */
249
/*                  alpha_cb         - alpha value for the boundary in U     */
250
/*                  beta_cb          - beta value for the boundary in U      */
251
/*                  alpha_cr         - alpha value for the boundary in V     */
252
/*                  beta_cr          - beta value for the boundary in V      */
253
/*                  u4_bs            - packed Boundary strength array        */
254
/*                  pu1_cliptab_cb   - tc0_table for U                       */
255
/*                  pu1_cliptab_cr   - tc0_table for V                       */
256
/*                                                                           */
257
/*  Globals       : None                                                     */
258
/*                                                                           */
259
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
260
/*                  title "Filtering process for edges for bS less than 4"   */
261
/*                  in ITU T Rec H.264 with alpha and beta values different  */
262
/*                  in U and V.                                              */
263
/*                                                                           */
264
/*  Outputs       : None                                                     */
265
/*                                                                           */
266
/*  Returns       : None                                                     */
267
/*                                                                           */
268
/*  Issues        : None                                                     */
269
/*                                                                           */
270
/*  Revision History:                                                        */
271
/*                                                                           */
272
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
273
/*         12 02 2015   Naveen Kumar P  Initial version                      */
274
/*         12 10 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
275
/*****************************************************************************/
276
void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
277
                                         WORD32 src_strd,
278
                                         WORD32 alpha_cb,
279
                                         WORD32 beta_cb,
280
                                         WORD32 alpha_cr,
281
                                         WORD32 beta_cr,
282
                                         UWORD32 u4_bs,
283
                                         const UWORD8 *pu1_cliptab_cb,
284
                                         const UWORD8 *pu1_cliptab_cr)
285
86.1k
{
286
86.1k
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
287
86.1k
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
288
86.1k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
289
290
86.1k
    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
291
86.1k
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
292
86.1k
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
293
86.1k
    __m256i p0q0_uv_32x8,p1q1_uv_32x8;
294
86.1k
    __m256i temp1,temp2,temp3,temp4;
295
86.1k
    __m256i flag_bs, flag1, flag2;
296
86.1k
    __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
297
86.1k
    __m256i zero = _mm256_setzero_si256();
298
86.1k
    __m256i C0_uv_8x32;
299
86.1k
    __m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1;
300
301
86.1k
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
302
303
86.1k
    i16_posQ1 = src_strd;
304
86.1k
    i16_posP0 = src_strd;
305
86.1k
    i16_posP1 = 0;
306
307
86.1k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
308
86.1k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
309
86.1k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
310
86.1k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
311
312
86.1k
    flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
313
86.1k
                           u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
314
86.1k
                           u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
315
86.1k
                           u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
316
86.1k
                           u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
317
86.1k
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
318
86.1k
                           u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
319
86.1k
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
320
86.1k
    flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
321
86.1k
    flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
322
323
86.1k
    p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0));
324
86.1k
    p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1));
325
326
86.1k
    res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8);
327
86.1k
    res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8);
328
329
86.1k
    temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0
330
86.1k
    temp4 = _mm256_unpackhi_epi8(res1, zero); //q0
331
86.1k
    temp1 = _mm256_unpacklo_epi8(res2, zero); //p1
332
86.1k
    temp2 = _mm256_unpackhi_epi8(res2, zero); //q1
333
334
86.1k
    diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h
335
86.1k
    diff = _mm256_abs_epi16(diff);
336
86.1k
    alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
337
86.1k
    flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
338
339
86.1k
    diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2
340
86.1k
    diff = _mm256_abs_epi16(diff);
341
86.1k
    beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
342
86.1k
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
343
344
86.1k
    diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3
345
86.1k
    diff = _mm256_abs_epi16(diff);
346
86.1k
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
347
348
86.1k
    diff = _mm256_subs_epi16(temp4, temp3);
349
86.1k
    diff = _mm256_slli_epi16(diff, 2);
350
86.1k
    diff1 = _mm256_subs_epi16(temp1, temp2);
351
86.1k
    diff = _mm256_add_epi16(diff, diff1);
352
86.1k
    diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
353
86.1k
    in_macro = _mm256_srai_epi16(diff, 3);
354
355
86.1k
    C0_uv_8x32 = _mm256_set_epi16(
356
86.1k
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
357
86.1k
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
358
86.1k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
359
86.1k
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
360
86.1k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
361
86.1k
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
362
86.1k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
363
86.1k
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
364
365
86.1k
    C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
366
367
86.1k
    in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
368
86.1k
    C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
369
86.1k
    in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
370
371
86.1k
    p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro);
372
86.1k
    q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro);
373
374
86.1k
    p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1);
375
86.1k
    flag1 = _mm256_packs_epi16(flag1, flag1);
376
86.1k
    flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
377
378
86.1k
    p0q0_uv_8x32_1 = _mm256_and_si256(res1,
379
86.1k
                                 _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
380
86.1k
    p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1);
381
86.1k
    p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2);
382
86.1k
    p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8);
383
384
86.1k
    _mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1);
385
386
86.1k
}