/src/libavc/common/x86/ih264_deblk_chroma_avx2.c

Source
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>

#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"

#include <stdint.h>
#include <string.h>
#include <immintrin.h>



/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bslt4_avx2()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when the boundary strength is less than 4  */
/*                  in high profile.                                         */
/*                                                                           */
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
/*                  src_strd         - source stride                         */
/*                  alpha_cb         - alpha value for the boundary in U     */
/*                  beta_cb          - beta value for the boundary in U      */
/*                  alpha_cr         - alpha value for the boundary in V     */
/*                  beta_cr          - beta value for the boundary in V      */
/*                  u4_bs            - packed Boundary strength array        */
/*                  pu1_cliptab_cb   - tc0_table for U                       */
/*                  pu1_cliptab_cr   - tc0_table for V                       */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.                                              */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
/*****************************************************************************/

void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    /*
     * Filters 8 rows (a..h) across a vertical edge. Every row contributes the
     * 8 bytes starting 4 bytes to the left of pu1_src, i.e. the interleaved
     * U,V samples p1 p0 | q0 q1. Only p0 and q0 are modified; p1 and q1 are
     * written back unchanged.
     *
     * Lane layout used throughout (16-bit elements, U in even, V in odd):
     *   low  128 bits : rows a, b, c, d  (2 elements per row)
     *   high 128 bits : rows e, f, g, h
     * Each boundary strength byte of u4_bs covers two consecutive rows, with
     * the most significant byte applying to rows a and b.
     */
    UWORD8 *pu1_row = pu1_src - 4;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    /* Pack cb in the low and cr in the high 16 bits so that a 32-bit      */
    /* broadcast yields alternating cb,cr thresholds per 16-bit element.   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i rows_lo, rows_hi;
    __m256i lineab, linecd, lineef, linegh, lineae, linecg;
    __m256i temp1, temp2, t1, t2, t3, t4, tmp1, tmp2;
    __m256i p1_16x16, p0_16x16, q0_16x16, q1_16x16;
    __m256i pq1_uv_32x8, pq0_uv_32x8, pq0_filt_32x8;
    __m256i flag_bs, flag1;
    __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
    __m256i zero = _mm256_setzero_si256();
    __m256i C0_uv_8x32;

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Mask that is all ones where the boundary strength is non-zero.      */
    /* _mm256_set_epi8 lists the highest byte first, so Bs0 ends up in the */
    /* lowest 8 bytes (rows a,b) and Bs3 in the highest 8 bytes (rows g,h).*/
    flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
                              u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
                              u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                              u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm256_cmpeq_epi8(flag_bs, zero);                   /* ones where bS == 0 */
    flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(-1));    /* ones where bS != 0 */

    /* Load exactly the 8 bytes needed from each row (upper halves are     */
    /* zero); even rows go to the low lane, odd rows to the high lane.     */
    lineab = _mm256_set_m128i(_mm_loadl_epi64((const __m128i *)(pu1_row + src_strd)),
                              _mm_loadl_epi64((const __m128i *)(pu1_row)));
    linecd = _mm256_set_m128i(_mm_loadl_epi64((const __m128i *)(pu1_row + 3 * src_strd)),
                              _mm_loadl_epi64((const __m128i *)(pu1_row + 2 * src_strd)));
    lineef = _mm256_set_m128i(_mm_loadl_epi64((const __m128i *)(pu1_row + 5 * src_strd)),
                              _mm_loadl_epi64((const __m128i *)(pu1_row + 4 * src_strd)));
    linegh = _mm256_set_m128i(_mm_loadl_epi64((const __m128i *)(pu1_row + 7 * src_strd)),
                              _mm_loadl_epi64((const __m128i *)(pu1_row + 6 * src_strd)));

    /* Transpose so that each register holds one column (p1, p0, q0 or q1) */
    /* of U,V pairs for all eight rows.                                    */
    temp1 = _mm256_unpacklo_epi16(lineab, linecd);  /* a01 c01 a23 c23 .. | b01 d01 b23 d23 .. */
    temp2 = _mm256_unpacklo_epi16(lineef, linegh);  /* e01 g01 e23 g23 .. | f01 h01 f23 h23 .. */

    t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);  /* a/c | e/g */
    t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);  /* b/d | f/h */

    tmp1 = _mm256_unpacklo_epi16(t2, t3);  /* a01 b01 c01 d01 a23 b23 c23 d23 | e.. f.. g.. h.. */
    tmp2 = _mm256_unpackhi_epi16(t2, t3);  /* a45 b45 c45 d45 a67 b67 c67 d67 | e.. f.. g.. h.. */

    /* Widen to 16 bits for signed arithmetic */
    p1_16x16 = _mm256_unpacklo_epi8(tmp1, zero);
    p0_16x16 = _mm256_unpackhi_epi8(tmp1, zero);
    q0_16x16 = _mm256_unpacklo_epi8(tmp2, zero);
    q1_16x16 = _mm256_unpackhi_epi8(tmp2, zero);

    /* Byte copies of the unfiltered samples: [p | q] within each lane */
    pq1_uv_32x8 = _mm256_packus_epi16(p1_16x16, q1_16x16);
    pq0_uv_32x8 = _mm256_packus_epi16(p0_16x16, q0_16x16);

    /* Filter condition 1: |p0 - q0| < alpha */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(p0_16x16, q0_16x16));
    alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
    flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);

    /* Filter condition 2: |q1 - q0| < beta */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(q1_16x16, q0_16x16));
    beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));

    /* Filter condition 3: |p1 - p0| < beta */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(p1_16x16, p0_16x16));
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm256_slli_epi16(_mm256_subs_epi16(q0_16x16, p0_16x16), 2);
    diff1 = _mm256_subs_epi16(p1_16x16, q1_16x16);
    diff = _mm256_add_epi16(diff, diff1);
    diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
    in_macro = _mm256_srai_epi16(diff, 3);

    /* Per-row clipping value taken from the clip tables. The arguments of */
    /* _mm256_set_epi16 are listed from the highest element down, so the   */
    /* Bs3 entries come first and the Bs0 entries last; this gives the     */
    /* same row assignment as flag_bs (Bs0 -> rows a,b ... Bs3 -> rows g,h)*/
    C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                                  pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                                  pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                                  pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                                  pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                                  pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                                  pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                                  pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* Clipping bound is the table value plus one */
    C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));

    /* delta = CLIP3(-C0, C0, delta) */
    in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro);
    C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
    in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta; packus saturates to 0..255 */
    pq0_filt_32x8 = _mm256_packus_epi16(_mm256_add_epi16(p0_16x16, in_macro),
                                        _mm256_sub_epi16(q0_16x16, in_macro));

    /* Final per-sample mask: all three conditions and bS != 0. Packing    */
    /* the mask with itself replicates it for the p half and the q half.   */
    flag1 = _mm256_and_si256(flag1, flag_bs);
    flag1 = _mm256_packs_epi16(flag1, flag1);

    /* Mask bytes are 0x00 or 0xFF, so the blend picks whole bytes:        */
    /* filtered value where the mask is set, original value elsewhere.     */
    pq0_uv_32x8 = _mm256_blendv_epi8(pq0_uv_32x8, pq0_filt_32x8, flag1);

    /* Transpose back to row order */
    t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8);   /* p1 p0 per row  */
    t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8);   /* q1 q0 per row  */

    /* Swap adjacent 16-bit elements so each pair becomes q0 q1 */
    t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1));
    t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));

    lineae = _mm256_unpacklo_epi32(t1, t4);   /* row a, row b | row e, row f */
    linecg = _mm256_unpackhi_epi32(t1, t4);   /* row c, row d | row g, row h */

    /* Store 8 bytes per row */
    rows_lo = _mm256_castsi256_si128(lineae);
    rows_hi = _mm256_extracti128_si256(lineae, 1);
    _mm_storel_epi64((__m128i *)(pu1_row), rows_lo);
    _mm_storel_epi64((__m128i *)(pu1_row + src_strd), _mm_srli_si128(rows_lo, 8));
    _mm_storel_epi64((__m128i *)(pu1_row + 4 * src_strd), rows_hi);
    _mm_storel_epi64((__m128i *)(pu1_row + 5 * src_strd), _mm_srli_si128(rows_hi, 8));

    rows_lo = _mm256_castsi256_si128(linecg);
    rows_hi = _mm256_extracti128_si256(linecg, 1);
    _mm_storel_epi64((__m128i *)(pu1_row + 2 * src_strd), rows_lo);
    _mm_storel_epi64((__m128i *)(pu1_row + 3 * src_strd), _mm_srli_si128(rows_lo, 8));
    _mm_storel_epi64((__m128i *)(pu1_row + 6 * src_strd), rows_hi);
    _mm_storel_epi64((__m128i *)(pu1_row + 7 * src_strd), _mm_srli_si128(rows_hi, 8));
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bslt4_avx2()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is less than  */
/*                  4 in high profile.                                       */
/*                                                                           */
/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
/*                  src_strd         - source stride                         */
/*                  alpha_cb         - alpha value for the boundary in U     */
/*                  beta_cb          - beta value for the boundary in U      */
/*                  alpha_cr         - alpha value for the boundary in V     */
/*                  beta_cr          - beta value for the boundary in V      */
/*                  u4_bs            - packed Boundary strength array        */
/*                  pu1_cliptab_cb   - tc0_table for U                       */
/*                  pu1_cliptab_cr   - tc0_table for V                       */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.                                              */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*         12 10 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    /*
     * Filters 16 interleaved U,V bytes across a horizontal edge. The four
     * rows involved are p1, p0 (above pu1_src) and q0, q1 (at and below
     * pu1_src); only the p0 and q0 rows are written.
     *
     * Row pointers are formed directly from the 32-bit stride so that the
     * stride is never narrowed to 16 bits.
     */
    UWORD8 *pu1_p1 = pu1_src - 2 * src_strd;
    UWORD8 *pu1_p0 = pu1_src - src_strd;
    UWORD8 *pu1_q0 = pu1_src;
    UWORD8 *pu1_q1 = pu1_src + src_strd;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;

    /* Pack cb in the low and cr in the high 16 bits so that a 32-bit      */
    /* broadcast yields alternating cb,cr thresholds per 16-bit element.   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m256i p0q0_uv_32x8, p1q1_uv_32x8;
    __m256i p1_16x16, p0_16x16, q0_16x16, q1_16x16;
    __m256i flag_bs, flag1;
    __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
    __m256i zero = _mm256_setzero_si256();
    __m256i C0_uv_8x32;
    __m256i p0q0_orig, p1q1_orig, p0q0_filt, p0q0_out;

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Byte mask that is all ones where the boundary strength is non-zero. */
    /* Each 128-bit lane holds its 8-byte pattern twice because the packed */
    /* data below is laid out as [p0 half | q0 half] within each lane:     */
    /* low lane uses Bs0,Bs1 (bytes 0..7 of the row), high lane Bs2,Bs3.   */
    flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
                              u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
                              u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
                              u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
                              u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                              u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
                              u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                              u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm256_cmpeq_epi8(flag_bs, zero);                   /* ones where bS == 0 */
    flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(-1));    /* ones where bS != 0 */

    /* p rows go to the low 128 bits, q rows to the high 128 bits */
    p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_q0), (__m128i *)(pu1_p0));
    p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_q1), (__m128i *)(pu1_p1));

    /* Swap the two middle 64-bit quarters: [p lo8, q lo8 | p hi8, q hi8]. */
    /* After this, unpacklo/unpackhi produce complete p and q rows.        */
    p0q0_orig = _mm256_permute4x64_epi64(p0q0_uv_32x8, 0xD8);
    p1q1_orig = _mm256_permute4x64_epi64(p1q1_uv_32x8, 0xD8);

    /* Widen to 16 bits for signed arithmetic */
    p0_16x16 = _mm256_unpacklo_epi8(p0q0_orig, zero);
    q0_16x16 = _mm256_unpackhi_epi8(p0q0_orig, zero);
    p1_16x16 = _mm256_unpacklo_epi8(p1q1_orig, zero);
    q1_16x16 = _mm256_unpackhi_epi8(p1q1_orig, zero);

    /* Filter condition 1: |p0 - q0| < alpha */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(p0_16x16, q0_16x16));
    alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
    flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);

    /* Filter condition 2: |q1 - q0| < beta */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(q1_16x16, q0_16x16));
    beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));

    /* Filter condition 3: |p1 - p0| < beta */
    diff = _mm256_abs_epi16(_mm256_subs_epi16(p1_16x16, p0_16x16));
    flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm256_slli_epi16(_mm256_subs_epi16(q0_16x16, p0_16x16), 2);
    diff1 = _mm256_subs_epi16(p1_16x16, q1_16x16);
    diff = _mm256_add_epi16(diff, diff1);
    diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
    in_macro = _mm256_srai_epi16(diff, 3);

    /* Clip table values per 16-bit element, highest element listed first: */
    /* elements 0..3 use Bs0, 4..7 Bs1, 8..11 Bs2, 12..15 Bs3.             */
    C0_uv_8x32 = _mm256_set_epi16(
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* Clipping bound is the table value plus one */
    C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));

    /* delta = CLIP3(-C0, C0, delta) */
    in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro);
    C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
    in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta; packus saturates to 0..255 and  */
    /* restores the [p lo8, q lo8 | p hi8, q hi8] byte layout.             */
    p0q0_filt = _mm256_packus_epi16(_mm256_add_epi16(p0_16x16, in_macro),
                                    _mm256_sub_epi16(q0_16x16, in_macro));

    /* Final mask: the three sample conditions and bS != 0. Packing the    */
    /* mask with itself replicates it for the p half and the q half.       */
    flag1 = _mm256_packs_epi16(flag1, flag1);
    flag1 = _mm256_and_si256(flag1, flag_bs);

    /* Mask bytes are 0x00 or 0xFF, so the blend picks whole bytes:        */
    /* filtered value where the mask is set, original value elsewhere.     */
    p0q0_out = _mm256_blendv_epi8(p0q0_orig, p0q0_filt, flag1);

    /* Undo the quarter swap so each 128-bit half is a complete row again */
    p0q0_out = _mm256_permute4x64_epi64(p0q0_out, 0xD8);

    _mm256_storeu2_m128i((__m128i *)(pu1_q0), (__m128i *)(pu1_p0), p0q0_out);
}

Coverage Report

Created: 2026-05-30 06:23

Line	Count	Source
1		/******************************************************************************
2		*
3		* Copyright (C) 2015 The Android Open Source Project
4		*
5		* Licensed under the Apache License, Version 2.0 (the "License");
6		* you may not use this file except in compliance with the License.
7		* You may obtain a copy of the License at:
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*
17		*****************************************************************************
18		* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19		*/
20		/*****************************************************************************/
21
22		/*****************************************************************************/
23		/* File Includes */
24		/*****************************************************************************/
25
26		/* System include files */
27		#include <stdio.h>
28
29		#ifdef __ANDROID__
30		#include "log/log.h"
31		#include <cutils/log.h>
32		#endif
33
34		/* User include files */
35		#include "ih264_typedefs.h"
36		#include "ih264_platform_macros.h"
37		#include "ih264_deblk_edge_filters.h"
38		#include "ih264_macros.h"
39
40		#include <stdint.h>
41		#include <string.h>
42		#include <immintrin.h>
43
44
45
46		/*****************************************************************************/
47		/* */
48		/* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */
49		/* */
50		/* Description : This function performs filtering of a chroma block */
51		/* vertical edge when the boundary strength is less than 4 */
52		/* in high profile. */
53		/* */
54		/* Inputs : pu1_src - pointer to the src sample q0 of U */
55		/* src_strd - source stride */
56		/* alpha_cb - alpha value for the boundary in U */
57		/* beta_cb - beta value for the boundary in U */
58		/* alpha_cr - alpha value for the boundary in V */
59		/* beta_cr - beta value for the boundary in V */
60		/* u4_bs - packed Boundary strength array */
61		/* pu1_cliptab_cb - tc0_table for U */
62		/* pu1_cliptab_cr - tc0_table for V */
63		/* */
64		/* Globals : None */
65		/* */
66		/* Processing : This operation is described in Sec. 8.7.2.3 under the */
67		/* title "Filtering process for edges for bS less than 4" */
68		/* in ITU T Rec H.264 with alpha and beta values different */
69		/* in U and V. */
70		/* */
71		/* Outputs : None */
72		/* */
73		/* Returns : None */
74		/* */
75		/* Issues : None */
76		/* */
77		/* Revision History: */
78		/* */
79		/* DD MM YYYY Author(s) Changes (Describe the changes made) */
80		/* 12 02 2015 Naveen Kumar P Initial version */
81		/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
82		/*****************************************************************************/
83
84		void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
85		WORD32 src_strd,
86		WORD32 alpha_cb,
87		WORD32 beta_cb,
88		WORD32 alpha_cr,
89		WORD32 beta_cr,
90		UWORD32 u4_bs,
91		const UWORD8 *pu1_cliptab_cb,
92		const UWORD8 *pu1_cliptab_cr)
93	77.0k	{
94	77.0k	UWORD8 pu1_src_uv = pu1_src; / Pointer to the src sample q0 of plane U*/
95	77.0k	UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
96	77.0k	WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
97	77.0k	WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
98	77.0k	__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
99	77.0k	__m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh;
100	77.0k	__m256i temp1, temp2, temp3, temp4;
101	77.0k	__m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32;
102
103	77.0k	__m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2;
104	77.0k	__m256i flag_bs, flag1, flag2;
105	77.0k	__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
106	77.0k	__m256i zero = _mm256_setzero_si256();
107	77.0k	__m256i C0_uv_8x32;
108	77.0k	__m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1;
109
110	77.0k	u1_Bs0 = (u4_bs >> 24) & 0xff;
111	77.0k	u1_Bs1 = (u4_bs >> 16) & 0xff;
112	77.0k	u1_Bs2 = (u4_bs >> 8) & 0xff;
113	77.0k	u1_Bs3 = (u4_bs >> 0) & 0xff;
114
115	77.0k	flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
116	77.0k	u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
117	77.0k	u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
118	77.0k	u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
119	77.0k	flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
120	77.0k	flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
121
122		/* Load and transpose the pixel values */
123	77.0k	lineab = _mm256_loadu2_m128i((__m128i )(pu1_src_uv - 4 + src_strd), (__m128i )(pu1_src_uv - 4));
124	77.0k	linecd = _mm256_loadu2_m128i((__m128i )(pu1_src_uv - 4 + 3 src_strd), (__m128i )(pu1_src_uv - 4 + 2 src_strd));
125	77.0k	lineef = _mm256_loadu2_m128i((__m128i )(pu1_src_uv - 4 + 5 src_strd), (__m128i )(pu1_src_uv - 4 + 4 src_strd));
126	77.0k	linegh = _mm256_loadu2_m128i((__m128i )(pu1_src_uv - 4 + 7 src_strd), (__m128i )(pu1_src_uv - 4 + 6 src_strd));
127
128	77.0k	temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000
129	77.0k	temp2 = _mm256_unpacklo_epi64(linecd, zero);
130	77.0k	temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000
131	77.0k	temp4 = _mm256_unpacklo_epi64(linegh, zero);
132
133	77.0k	temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7
134	77.0k	temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1
135
136	77.0k	t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);
137	77.0k	t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);
138
139	77.0k	tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3..
140	77.0k	tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7
141
142
143	77.0k	temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1
144	77.0k	temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0
145	77.0k	temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0
146	77.0k	temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1
147
148	77.0k	pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213
149	77.0k	pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213
150
151	77.0k	diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3)
152	77.0k	diff = _mm256_abs_epi16(diff);
153	77.0k	alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
154	77.0k	flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
155
156	77.0k	diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0)
157	77.0k	diff = _mm256_abs_epi16(diff);
158	77.0k	beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
159	77.0k	flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
160
161
162	77.0k	diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0)
163	77.0k	diff = _mm256_abs_epi16(diff);
164	77.0k	flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
165
166	77.0k	diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0)
167	77.0k	diff = _mm256_slli_epi16(diff, 2);
168
169	77.0k	diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1)
170	77.0k	diff = _mm256_add_epi16(diff, diff1);
171
172	77.0k	diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
173	77.0k	in_macro = _mm256_srai_epi16(diff, 3);
174
175
176	77.0k	C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
177	77.0k	pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
178	77.0k	pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
179	77.0k	pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
180	77.0k	pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
181	77.0k	pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
182	77.0k	pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
183	77.0k	pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
184
185	77.0k	C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
186
187	77.0k	in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
188	77.0k	C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
189	77.0k	in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
190
191	77.0k	p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro);
192	77.0k	q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro);
193
194
195	77.0k	flag1 = _mm256_and_si256(flag1, flag_bs);
196	77.0k	flag1 = _mm256_packs_epi16(flag1, flag1); // 0213
197
198	77.0k	pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213
199
200	77.0k	pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8,
201	77.0k	_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
202	77.0k	pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1);
203	77.0k	pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2);
204
205
206	77.0k	t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3
207	77.0k	t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4
208
209	77.0k	t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw
210	77.0k	t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));
211
212	77.0k	lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3
213	77.0k	linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4
214
215	77.0k	linea = _mm256_castsi256_si128(lineae);
216	77.0k	lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
217	77.0k	lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1);
218	77.0k	linee = _mm256_castsi256_si128(lineae);
219	77.0k	linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
220
221
222	77.0k	linec = _mm256_castsi256_si128(linecg);
223	77.0k	lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
224	77.0k	linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1);
225	77.0k	lineg = _mm256_castsi256_si128(linecg);
226	77.0k	lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
227
228	77.0k	_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
229	77.0k	_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
230	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 2 src_strd), linec);
231	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 3 src_strd), lined);
232	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 4 src_strd), linee);
233	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 5 src_strd), linef);
234	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 6 src_strd), lineg);
235	77.0k	_mm_storel_epi64((__m128i )(pu1_src_uv - 4 + 7 src_strd), lineh);
236
237	77.0k	}
238
239		/*****************************************************************************/
240		/* */
241		/* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */
242		/* */
243		/* Description : This function performs filtering of a chroma block */
244		/* horizontal edge when the boundary strength is less than */
245		/* 4 in high profile. */
246		/* */
247		/* Inputs : pu1_src - pointer to the src sample q0 of U */
248		/* src_strd - source stride */
249		/* alpha_cb - alpha value for the boundary in U */
250		/* beta_cb - beta value for the boundary in U */
251		/* alpha_cr - alpha value for the boundary in V */
252		/* beta_cr - beta value for the boundary in V */
253		/* u4_bs - packed Boundary strength array */
254		/* pu1_cliptab_cb - tc0_table for U */
255		/* pu1_cliptab_cr - tc0_table for V */
256		/* */
257		/* Globals : None */
258		/* */
259		/* Processing : This operation is described in Sec. 8.7.2.3 under the */
260		/* title "Filtering process for edges for bS less than 4" */
261		/* in ITU T Rec H.264 with alpha and beta values different */
262		/* in U and V. */
263		/* */
264		/* Outputs : None */
265		/* */
266		/* Returns : None */
267		/* */
268		/* Issues : None */
269		/* */
270		/* Revision History: */
271		/* */
272		/* DD MM YYYY Author(s) Changes (Describe the changes made) */
273		/* 12 02 2015 Naveen Kumar P Initial version */
274		/* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
275		/*****************************************************************************/
276		void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
277		WORD32 src_strd,
278		WORD32 alpha_cb,
279		WORD32 beta_cb,
280		WORD32 alpha_cr,
281		WORD32 beta_cr,
282		UWORD32 u4_bs,
283		const UWORD8 *pu1_cliptab_cb,
284		const UWORD8 *pu1_cliptab_cr)
285	86.1k	{
286	86.1k	UWORD8 pu1_src_uv = pu1_src; / Pointer to the src sample q0 of plane U*/
287	86.1k	WORD16 i16_posP1, i16_posP0, i16_posQ1;
288	86.1k	UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
289
290	86.1k	UWORD8 pu1_HorzPixelUV; /! < Pointer to the first pixel of the boundary */
291	86.1k	WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
292	86.1k	WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
293	86.1k	__m256i p0q0_uv_32x8,p1q1_uv_32x8;
294	86.1k	__m256i temp1,temp2,temp3,temp4;
295	86.1k	__m256i flag_bs, flag1, flag2;
296	86.1k	__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
297	86.1k	__m256i zero = _mm256_setzero_si256();
298	86.1k	__m256i C0_uv_8x32;
299	86.1k	__m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1;
300
301	86.1k	pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
302
303	86.1k	i16_posQ1 = src_strd;
304	86.1k	i16_posP0 = src_strd;
305	86.1k	i16_posP1 = 0;
306
307	86.1k	u1_Bs0 = (u4_bs >> 24) & 0xff;
308	86.1k	u1_Bs1 = (u4_bs >> 16) & 0xff;
309	86.1k	u1_Bs2 = (u4_bs >> 8) & 0xff;
310	86.1k	u1_Bs3 = (u4_bs >> 0) & 0xff;
311
312	86.1k	flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
313	86.1k	u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
314	86.1k	u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
315	86.1k	u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
316	86.1k	u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
317	86.1k	u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
318	86.1k	u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
319	86.1k	u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
320	86.1k	flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
321	86.1k	flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
322
323	86.1k	p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i )(pu1_src_uv), (__m128i )(pu1_HorzPixelUV + i16_posP0));
324	86.1k	p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i )(pu1_src_uv + i16_posQ1), (__m128i )(pu1_HorzPixelUV + i16_posP1));
325
326	86.1k	res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8);
327	86.1k	res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8);
328
329	86.1k	temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0
330	86.1k	temp4 = _mm256_unpackhi_epi8(res1, zero); //q0
331	86.1k	temp1 = _mm256_unpacklo_epi8(res2, zero); //p1
332	86.1k	temp2 = _mm256_unpackhi_epi8(res2, zero); //q1
333
334	86.1k	diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h
335	86.1k	diff = _mm256_abs_epi16(diff);
336	86.1k	alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
337	86.1k	flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
338
339	86.1k	diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2
340	86.1k	diff = _mm256_abs_epi16(diff);
341	86.1k	beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
342	86.1k	flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
343
344	86.1k	diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3
345	86.1k	diff = _mm256_abs_epi16(diff);
346	86.1k	flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
347
348	86.1k	diff = _mm256_subs_epi16(temp4, temp3);
349	86.1k	diff = _mm256_slli_epi16(diff, 2);
350	86.1k	diff1 = _mm256_subs_epi16(temp1, temp2);
351	86.1k	diff = _mm256_add_epi16(diff, diff1);
352	86.1k	diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
353	86.1k	in_macro = _mm256_srai_epi16(diff, 3);
354
355	86.1k	C0_uv_8x32 = _mm256_set_epi16(
356	86.1k	pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
357	86.1k	pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
358	86.1k	pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
359	86.1k	pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
360	86.1k	pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
361	86.1k	pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
362	86.1k	pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
363	86.1k	pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
364
365	86.1k	C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
366
367	86.1k	in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
368	86.1k	C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
369	86.1k	in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
370
371	86.1k	p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro);
372	86.1k	q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro);
373
374	86.1k	p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1);
375	86.1k	flag1 = _mm256_packs_epi16(flag1, flag1);
376	86.1k	flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
377
378	86.1k	p0q0_uv_8x32_1 = _mm256_and_si256(res1,
379	86.1k	_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
380	86.1k	p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1);
381	86.1k	p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2);
382	86.1k	p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8);
383
384	86.1k	_mm256_storeu2_m128i((__m128i )(pu1_src_uv),(__m128i )(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1);
385
386	86.1k	}