/src/libavc/common/x86/ih264_deblk_luma_avx2.c

Source
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_deblk_luma_avx2.c                              */
/*                                                                           */
/*  Description       : Contains function definitions for deblocking         */
/*                                                                           */
/*  List of Functions : ih264_deblk_luma_horz_bslt4_avx2()                   */
/*                      ih264_deblk_luma_vert_bslt4_avx2()                   */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
/*                                      intrinsics                           */
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
/*****************************************************************************/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"


/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bslt4_avx2()                       */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  horizontal edge when boundary strength is less than 4.   */
/*                                                                           */
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
/*                  src_strd      - source stride                            */
/*                  alpha         - alpha value for the boundary             */
/*                  beta          - beta value for the boundary              */
/*                  u4_bs         - packed Boundary strength array           */
/*                  pu1_cliptab   - tc0_table                                */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264.                                      */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    /*
     * Filters 16 luma pixels across one horizontal edge (bS < 4).
     *
     * pu1_src points at row q0; rows p0, p1, p2 lie one, two and three
     * strides above it, rows q1, q2 one and two strides below it.  Rows p1,
     * p0, q0 and q1 may be rewritten; p2 and q2 are only read.
     *
     * u4_bs carries four boundary strengths, most significant byte first,
     * each covering four consecutive pixels.  pu1_cliptab is indexed with
     * each strength to obtain the clipping value C0 for those pixels.
     *
     * Register layouts used below (each 256-bit register has two 128-bit
     * lanes, written [low lane | high lane]):
     *   "natural"     : [rowA 0..15 | rowB 0..15]            (8-bit)
     *   "interleaved" : [rowA 0..7, rowB 0..7 | rowA 8..15, rowB 8..15]
     *                   obtained with _mm256_permute4x64_epi64(x, 0xD8);
     *                   the same permute converts back to natural.
     * Unpacking an interleaved register with unpacklo/unpackhi yields rowA
     * and rowB as sixteen 16-bit values in pixel order.
     *
     * All masks are kept in natural layout and applied only after results
     * have been permuted back to natural layout, so every mask byte lines up
     * with the pixel it was computed for.
     */
    const __m256i zero = _mm256_setzero_si256();
    const __m128i zero_128 = _mm_setzero_si128();

    /* Row pointers use full-width arithmetic so large strides do not wrap. */
    UWORD8 *pu1_p0 = pu1_src - src_strd;
    UWORD8 *pu1_p1 = pu1_src - 2 * src_strd;
    UWORD8 *pu1_p2 = pu1_src - 3 * src_strd;
    UWORD8 *pu1_q1 = pu1_src + src_strd;
    UWORD8 *pu1_q2 = pu1_src + 2 * src_strd;

    __m256i p0q0_32x8, p1q1_32x8, p2q2_32x8, q0p0_32x8;
    __m256i Beta_8x32, C0_8x32, C_8x32, neg_C_8x32;
    __m256i filter_32x8, beta_flag_32x8, apaq_32x8;
    __m256i res1, res2, p0_8x32, q0_8x32, p1_8x32, q1_8x32;
    __m256i delta_8x32, avg_8x32, dp_8x32, dq_8x32;
    __m256i temp1, temp2, temp3, temp4;
    __m128i Alpha_8x16, bs_zero_16x8, C0_16x8;
    __m128i p0_16x8, q0_16x8, diff_16x8, lo_128, hi_128, filter_16x8;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;

    /* Natural layout: low lane = p-side row, high lane = q-side row. */
    p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)pu1_src, (__m128i *)pu1_p0);
    p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)pu1_q1, (__m128i *)pu1_p1);
    p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)pu1_q2, (__m128i *)pu1_p2);

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x32 = _mm256_set1_epi16(beta);

    /* 0xFF in every byte whose boundary strength is zero (no filtering). */
    bs_zero_16x8 = _mm_cmpeq_epi8(
                    _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
                                 u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
                                 u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0),
                    zero_128);

    /* C0 per pixel, widened to sixteen 16-bit values in pixel order. */
    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3,
                           clip2, clip2, clip2, clip2,
                           clip1, clip1, clip1, clip1,
                           clip0, clip0, clip0, clip0);
    C0_8x32 = _mm256_cvtepu8_epi16(C0_16x8);

    /* Cond1: ABS(p0 - q0) < alpha.  q0 must come from the HIGH lane. */
    p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
    q0_16x8 = _mm256_extracti128_si256(p0q0_32x8, 1);
    q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);

    /* One of the two saturating differences is zero, so OR gives ABS. */
    diff_16x8 = _mm_or_si128(_mm_subs_epu8(q0_16x8, p0_16x8),
                             _mm_subs_epu8(p0_16x8, q0_16x8));
    lo_128 = _mm_cmpgt_epi16(Alpha_8x16, _mm_unpacklo_epi8(diff_16x8, zero_128));
    hi_128 = _mm_cmpgt_epi16(Alpha_8x16, _mm_unpackhi_epi8(diff_16x8, zero_128));
    filter_16x8 = _mm_packs_epi16(lo_128, hi_128);
    filter_16x8 = _mm_andnot_si128(bs_zero_16x8, filter_16x8); /* bS != 0 */

    /* Cond3 (ABS(p1 - p0) < beta) in low lane, Cond2 (ABS(q1 - q0) < beta)
     * in high lane. */
    temp1 = _mm256_or_si256(_mm256_subs_epu8(p0q0_32x8, p1q1_32x8),
                            _mm256_subs_epu8(p1q1_32x8, p0q0_32x8));
    temp2 = _mm256_cmpgt_epi16(Beta_8x32, _mm256_unpacklo_epi8(temp1, zero));
    temp1 = _mm256_cmpgt_epi16(Beta_8x32, _mm256_unpackhi_epi8(temp1, zero));
    beta_flag_32x8 = _mm256_packs_epi16(temp2, temp1);

    /* A pixel is filtered only when Cond1, Cond2 and Cond3 all hold, so the
     * two lanes of the beta test are ANDed together before use. */
    filter_16x8 = _mm_and_si128(
                    filter_16x8,
                    _mm_and_si128(_mm256_castsi256_si128(beta_flag_32x8),
                                  _mm256_extracti128_si256(beta_flag_32x8, 1)));
    filter_32x8 = _mm256_set_m128i(filter_16x8, filter_16x8);

    /* Ap = ABS(p2 - p0) < beta (low lane), Aq = ABS(q2 - q0) < beta (high). */
    temp1 = _mm256_or_si256(_mm256_subs_epu8(p0q0_32x8, p2q2_32x8),
                            _mm256_subs_epu8(p2q2_32x8, p0q0_32x8));
    temp2 = _mm256_cmpgt_epi16(Beta_8x32, _mm256_unpacklo_epi8(temp1, zero));
    temp1 = _mm256_cmpgt_epi16(Beta_8x32, _mm256_unpackhi_epi8(temp1, zero));

    /* Natural-layout mask for the p1 / q1 update: side test AND filter. */
    apaq_32x8 = _mm256_and_si256(filter_32x8, _mm256_packs_epi16(temp2, temp1));

    /* C = C0 + (Ap < beta) + (Aq < beta).  The compare results are 0 or -1
     * per 16-bit element, so subtracting them adds 0 or 1. */
    temp3 = _mm256_permute2x128_si256(temp2, temp1, 0x20); /* Ap, pixels 0..15 */
    temp4 = _mm256_permute2x128_si256(temp2, temp1, 0x31); /* Aq, pixels 0..15 */
    C_8x32 = _mm256_sub_epi16(_mm256_sub_epi16(C0_8x32, temp3), temp4);
    neg_C_8x32 = _mm256_sub_epi16(zero, C_8x32);

    /* Interleave so that unpacking yields whole rows as 16-bit values. */
    res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
    res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
    q0_8x32 = _mm256_unpacklo_epi8(res1, zero);
    p0_8x32 = _mm256_unpackhi_epi8(res1, zero);
    p1_8x32 = _mm256_unpacklo_epi8(res2, zero);
    q1_8x32 = _mm256_unpackhi_epi8(res2, zero);

    /* delta = CLIP3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) */
    delta_8x32 = _mm256_slli_epi16(_mm256_sub_epi16(q0_8x32, p0_8x32), 2);
    delta_8x32 = _mm256_add_epi16(delta_8x32,
                                  _mm256_sub_epi16(p1_8x32, q1_8x32));
    delta_8x32 = _mm256_add_epi16(delta_8x32, _mm256_set1_epi16(4));
    delta_8x32 = _mm256_srai_epi16(delta_8x32, 3);
    delta_8x32 = _mm256_min_epi16(C_8x32, delta_8x32);
    delta_8x32 = _mm256_max_epi16(neg_C_8x32, delta_8x32);

    /* p0' = p0 + delta, q0' = q0 - delta; packus saturates to 0..255. */
    temp1 = _mm256_packus_epi16(_mm256_sub_epi16(q0_8x32, delta_8x32),
                                _mm256_add_epi16(p0_8x32, delta_8x32));
    temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);  /* back to [q0' | p0'] */
    temp1 = _mm256_blendv_epi8(q0p0_32x8, temp1, filter_32x8);
    _mm256_storeu2_m128i((__m128i *)pu1_p0, (__m128i *)pu1_src, temp1);

    /* p1 / q1 update, using the unfiltered p0 and q0 still held in registers:
     *   p1' = p1 + CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
     *   q1' = q1 + CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1)
     */
    avg_8x32 = _mm256_avg_epu16(q0_8x32, p0_8x32);

    /* Natural-layout unpack: low lane is the p side, high lane the q side. */
    temp3 = _mm256_sub_epi16(
                _mm256_unpacklo_epi8(p2q2_32x8, zero),
                _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1));
    temp2 = _mm256_sub_epi16(
                _mm256_unpackhi_epi8(p2q2_32x8, zero),
                _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1));

    dp_8x32 = _mm256_permute2x128_si256(temp3, temp2, 0x20); /* p2 - 2*p1 */
    dq_8x32 = _mm256_permute2x128_si256(temp3, temp2, 0x31); /* q2 - 2*q1 */
    dp_8x32 = _mm256_srai_epi16(_mm256_add_epi16(avg_8x32, dp_8x32), 1);
    dq_8x32 = _mm256_srai_epi16(_mm256_add_epi16(avg_8x32, dq_8x32), 1);

    temp4 = _mm256_sub_epi16(zero, C0_8x32);                 /* -C0 */
    dp_8x32 = _mm256_max_epi16(temp4, _mm256_min_epi16(C0_8x32, dp_8x32));
    dq_8x32 = _mm256_max_epi16(temp4, _mm256_min_epi16(C0_8x32, dq_8x32));

    temp1 = _mm256_packus_epi16(_mm256_add_epi16(p1_8x32, dp_8x32),
                                _mm256_add_epi16(q1_8x32, dq_8x32));
    temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);  /* back to [p1' | q1'] */
    temp1 = _mm256_blendv_epi8(p1q1_32x8, temp1, apaq_32x8);
    _mm256_storeu2_m128i((__m128i *)pu1_q1, (__m128i *)pu1_p1, temp1);
}

Coverage Report

Created: 2026-03-31 06:08

Line	Count	Source
1		/******************************************************************************
2		*
3		* Copyright (C) 2015 The Android Open Source Project
4		*
5		* Licensed under the Apache License, Version 2.0 (the "License");
6		* you may not use this file except in compliance with the License.
7		* You may obtain a copy of the License at:
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*
17		*****************************************************************************
18		* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19		*/
20		/*****************************************************************************/
21		/* */
22		/* File Name : ih264_deblk_luma_avx2.c */
23		/* */
24		/* Description : Contains function definitions for deblocking */
25		/* */
26		/* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */
27		/* ih264_deblk_luma_vert_bslt4_avx2() */
28		/* */
29		/* Issues / Problems : None */
30		/* */
31		/* Revision History : */
32		/* */
33		/* DD MM YYYY Author(s) Changes (Describe the changes made) */
34		/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
35		/* intrinsics */
36		/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
37		/*****************************************************************************/
38
39		/*****************************************************************************/
40		/* File Includes */
41		/*****************************************************************************/
42
43		/* System include files */
44		#include <stdio.h>
45		#ifdef __ANDROID__
46		#include "log/log.h"
47		#include <cutils/log.h>
48		#endif
49
50		/* User include files */
51		#include "ih264_typedefs.h"
52		#include "ih264_platform_macros.h"
53		#include "ih264_deblk_edge_filters.h"
54		#include "ih264_macros.h"
55
56
57		/*****************************************************************************/
58		/* */
59		/* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */
60		/* */
61		/* Description : This function performs filtering of a luma block */
62		/* horizontal edge when boundary strength is less than 4. */
63		/* */
64		/* Inputs : pu1_src - pointer to the src sample q0 */
65		/* src_strd - source stride */
66		/* alpha - alpha value for the boundary */
67		/* beta - beta value for the boundary */
68		/* u4_bs - packed Boundary strength array */
69		/* pu1_cliptab - tc0_table */
70		/* */
71		/* Globals : None */
72		/* */
73		/* Processing : This operation is described in Sec. 8.7.2.3 under the */
74		/* title "Filtering process for edges for bS less than 4" */
75		/* in ITU T Rec H.264. */
76		/* */
77		/* Outputs : None */
78		/* */
79		/* Returns : None */
80		/* */
81		/* Issues : None */
82		/* */
83		/* Revision History: */
84		/* */
85		/* DD MM YYYY Author(s) Changes (Describe the changes made) */
86		/* 12 02 2015 Naveen Kumar P Initial version */
87		/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
88		/*****************************************************************************/
89		void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
90		WORD32 src_strd,
91		WORD32 alpha,
92		WORD32 beta,
93		UWORD32 u4_bs,
94		const UWORD8 *pu1_cliptab)
95	35.1k	{
96
97	35.1k	WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
98	35.1k	UWORD8 *pu1_HorzPixel;
99	35.1k	__m256i zero = _mm256_setzero_si256();
100	35.1k	__m128i zero_128 = _mm_setzero_si128();
101	35.1k	__m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16;
102	35.1k	__m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8;
103	35.1k	__m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8;
104	35.1k	__m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128;
105	35.1k	__m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8;
106	35.1k	UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
107	35.1k	UWORD8 clip0, clip1, clip2, clip3;
108
109	35.1k	pu1_HorzPixel = pu1_src - (src_strd << 2);
110
111	35.1k	i16_posQ1 = src_strd;
112	35.1k	i16_posQ2 = X2(src_strd);
113	35.1k	i16_posP0 = X3(src_strd);
114	35.1k	i16_posP1 = X2(src_strd);
115	35.1k	i16_posP2 = src_strd;
116
117	35.1k	p0q0_32x8 = _mm256_loadu2_m128i((__m128i )(pu1_src), (__m128i )(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0
118	35.1k	p1q1_32x8 = _mm256_loadu2_m128i((__m128i )(pu1_src + i16_posQ1), (__m128i )(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1
119	35.1k	p2q2_32x8 = _mm256_loadu2_m128i((__m128i )(pu1_src + i16_posQ2), (__m128i )(pu1_HorzPixel + i16_posP2));
120
121	35.1k	u1_Bs0 = (u4_bs >> 24) & 0xff;
122	35.1k	u1_Bs1 = (u4_bs >> 16) & 0xff;
123	35.1k	u1_Bs2 = (u4_bs >> 8) & 0xff;
124	35.1k	u1_Bs3 = (u4_bs >> 0) & 0xff;
125	35.1k	clip0 = pu1_cliptab[u1_Bs0];
126	35.1k	clip1 = pu1_cliptab[u1_Bs1];
127	35.1k	clip2 = pu1_cliptab[u1_Bs2];
128	35.1k	clip3 = pu1_cliptab[u1_Bs3];
129
130	35.1k	Alpha_8x16 = _mm_set1_epi16(alpha);
131	35.1k	Beta_8x32 = _mm256_set1_epi16(beta);
132
133	35.1k	bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
134	35.1k	u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
135	35.1k	u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
136	35.1k	u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
137
138	35.1k	C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
139	35.1k	clip2, clip1, clip1, clip1, clip1, clip0, clip0,
140	35.1k	clip0, clip0);
141
142	35.1k	bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128);
143	35.1k	bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
144	35.1k	C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128);
145	35.1k	C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128);
146	35.1k	C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16);
147
148		//Cond1 (ABS(p0 - q0) < alpha)
149	35.1k	p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
150	35.1k	q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);
151	35.1k	q0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
152	35.1k	temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8);
153	35.1k	temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8);
154	35.1k	temp1_128 = _mm_add_epi8(temp1_128, temp2_128);
155
156	35.1k	temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128);
157	35.1k	temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128);
158
159	35.1k	temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128);
160	35.1k	temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128);
161	35.1k	flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128);
162	35.1k	flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b);
163
164	35.1k	flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128);
165
166		//Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta)
167	35.1k	temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8);
168	35.1k	temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8);
169	35.1k	temp1 = _mm256_add_epi8(temp1, temp2);
170
171	35.1k	temp2 = _mm256_unpacklo_epi8(temp1, zero);
172	35.1k	temp1 = _mm256_unpackhi_epi8(temp1, zero);
173
174	35.1k	temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
175	35.1k	temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
176
177	35.1k	flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
178
179		//!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
180	35.1k	flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
181
182		//(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta)
183	35.1k	temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8);
184	35.1k	temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8);
185	35.1k	temp1 = _mm256_add_epi8(temp1, temp2);
186
187	35.1k	temp2 = _mm256_unpacklo_epi8(temp1, zero);
188	35.1k	temp1 = _mm256_unpackhi_epi8(temp1, zero);
189	35.1k	temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
190	35.1k	temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
191
192	35.1k	flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
193	35.1k	flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
194
195	35.1k	temp2 = _mm256_subs_epi16(zero, temp2);
196	35.1k	temp1 = _mm256_subs_epi16(zero, temp1);
197
198	35.1k	temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding
199	35.1k	temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding
200	35.1k	temp2 = _mm256_add_epi16(temp3,temp4);
201	35.1k	C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); //
202	35.1k	const_val4_8x32 = _mm256_set1_epi16(4);
203
204	35.1k	res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
205	35.1k	res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
206
207	35.1k	temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero),
208	35.1k	_mm256_unpackhi_epi8(res1, zero));
209	35.1k	temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero),
210	35.1k	_mm256_unpackhi_epi8(res2, zero));
211
212	35.1k	temp1 = _mm256_slli_epi16(temp3, 2);
213	35.1k	temp1 = _mm256_add_epi16(temp1, temp4);
214	35.1k	temp1 = _mm256_add_epi16(temp1, const_val4_8x32);
215	35.1k	in_macro_32x8 = _mm256_srai_epi16(temp1, 3);
216
217	35.1k	in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3
218	35.1k	C_8x32 = _mm256_subs_epi16(zero, C_8x32);
219	35.1k	in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3
220
221	35.1k	temp3 = _mm256_unpacklo_epi8(res1, zero); //q0
222	35.1k	temp4 = _mm256_unpackhi_epi8(res1, zero); //p0
223
224	35.1k	temp1 = _mm256_add_epi16(temp4, in_macro_32x8);
225	35.1k	temp2 = _mm256_sub_epi16(temp3, in_macro_32x8);
226
227	35.1k	temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed
228
229	35.1k	temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0
230
231	35.1k	temp2 = _mm256_and_si256(res1,
232	35.1k	_mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF)));
233
234	35.1k	temp1 = _mm256_add_epi8(temp1, temp2);
235	35.1k	temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
236	35.1k	_mm256_storeu2_m128i((__m128i )(pu1_HorzPixel + i16_posP0),(__m128i )(pu1_src),temp1);
237
238		//if(Ap < Beta) if(Aq < Beta)
239	35.1k	temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero),
240	35.1k	_mm256_unpackhi_epi8(res1, zero));
241
242	35.1k	temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1);
243	35.1k	temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2);
244
245	35.1k	temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1);
246	35.1k	temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2);
247
248	35.1k	temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0
249	35.1k	temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31);
250	35.1k	temp4 = _mm256_add_epi16(temp1, temp4); //p
251	35.1k	in_macro_1 = _mm256_srai_epi16(temp4, 1);
252	35.1k	temp3 = _mm256_add_epi16(temp1, temp3); //q
253	35.1k	in_macro_2 = _mm256_srai_epi16(temp3, 1);
254
255	35.1k	in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3
256	35.1k	C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
257	35.1k	in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3
258
259	35.1k	in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3
260	35.1k	C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
261	35.1k	in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3
262
263	35.1k	temp1 = _mm256_unpacklo_epi8(res2, zero);
264	35.1k	temp2 = _mm256_unpackhi_epi8(res2, zero);
265
266	35.1k	temp1 = _mm256_add_epi16(temp1, in_macro_1);
267	35.1k	temp2 = _mm256_add_epi16(temp2, in_macro_2);
268	35.1k	temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh
269	35.1k	temp1 = _mm256_and_si256(temp1, flag2_32x8);
270	35.1k	temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF)));
271	35.1k	temp1 = _mm256_add_epi8(temp1, temp2);
272	35.1k	temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
273	35.1k	_mm256_storeu2_m128i((__m128i )(pu1_src + i16_posQ1),(__m128i )(pu1_HorzPixel + i16_posP1),temp1);
274
275	35.1k	}