Coverage Report

Created: 2026-01-09 06:18

/src/libavc/common/x86/ih264_weighted_pred_avx2.c
  Line|  Count|Source
     1|       |/******************************************************************************
     2|       | *
     3|       | * Copyright (C) 2015 The Android Open Source Project
     4|       | *
     5|       | * Licensed under the Apache License, Version 2.0 (the "License");
     6|       | * you may not use this file except in compliance with the License.
     7|       | * You may obtain a copy of the License at:
     8|       | *
     9|       | * http://www.apache.org/licenses/LICENSE-2.0
    10|       | *
    11|       | * Unless required by applicable law or agreed to in writing, software
    12|       | * distributed under the License is distributed on an "AS IS" BASIS,
    13|       | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14|       | * See the License for the specific language governing permissions and
    15|       | * limitations under the License.
    16|       | *
    17|       | *****************************************************************************
    18|       | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
    19|       |*/
    20|       |/*****************************************************************************/
    21|       |/*****************************************************************************/
    22|       |/* File Includes                                                             */
    23|       |/*****************************************************************************/
    24|       |
    25|       |#include <immintrin.h>
    26|       |#include "ih264_typedefs.h"
    27|       |#include "ih264_macros.h"
    28|       |#include "ih264_platform_macros.h"
    29|       |#include "ih264_weighted_pred.h"
    30|       |#include <stdint.h>
    31|       |#include <string.h>
    32|       |
    33|       |#include <stdio.h>
    34|       |
    35|       |
    36|       |/*****************************************************************************/
    37|       |/*                                                                           */
    38|       |/*  Function Name : ih264_weighted_bi_pred_luma_avx2                         */
    39|       |/*                                                                           */
    40|       |/*  Description   : This function performs the weighted biprediction as      */
    41|       |/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
    42|       |/*                  prediction process" for luma. The function gets two      */
    43|       |/*                  ht x wd blocks, weights them, adds them, rounds off the  */
    44|       |/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
    45|       |/*                  stores it in the destination block. (ht,wd) can be       */
    46|       |/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
    47|       |/*                                                                           */
    48|       |/*  Inputs        : pu1_src1  - Pointer to source 1                          */
    49|       |/*                  pu1_src2  - Pointer to source 2                          */
    50|       |/*                  pu1_dst   - Pointer to destination                       */
    51|       |/*                  src_strd1 - stride for source 1                          */
    52|       |/*                  src_strd2 - stride for source 2                          */
    53|       |/*                  dst_strd  - stride for destination                       */
    54|       |/*                  log_wd    - number of bits to be rounded off             */
    55|       |/*                  wt1       - weight value for source 1                    */
    56|       |/*                  wt2       - weight value for source 2                    */
    57|       |/*                  ofst1     - offset value for source 1                    */
    58|       |/*                  ofst2     - offset value for source 2                    */
    59|       |/*                  ht        - height of the block                          */
    60|       |/*                  wd        - width of the block                           */
    61|       |/*                                                                           */
    62|       |/*  Issues        : None                                                     */
    63|       |/*                                                                           */
    64|       |/*  Revision History:                                                        */
    65|       |/*                                                                           */
    66|       |/*         DD MM YYYY   Author(s)       Changes                              */
    67|       |/*         04 02 2015   Kaushik         Initial Version                      */
    68|       |/*                      Senthoor                                             */
    69|       |/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
    70|       |/*****************************************************************************/
    71|       |void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1,
    72|       |                                      UWORD8 *pu1_src2,
    73|       |                                      UWORD8 *pu1_dst,
    74|       |                                      WORD32 src_strd1,
    75|       |                                      WORD32 src_strd2,
    76|       |                                      WORD32 dst_strd,
    77|       |                                      WORD32 log_wd,
    78|       |                                      WORD32 wt1,
    79|       |                                      WORD32 wt2,
    80|       |                                      WORD32 ofst1,
    81|       |                                      WORD32 ofst2,
    82|       |                                      WORD32 ht,
    83|       |                                      WORD32 wd)
    84|  1.85k|{
    85|       |
    86|  1.85k|    __m256i wt1_8x32b, wt2_8x32b;
    87|  1.85k|    __m256i ofst_8x32b, round_8x32b;
    88|  1.85k|    __m256i zero;
    89|  1.85k|    zero = _mm256_set1_epi8(0);
    90|       |
    91|  1.85k|    WORD32 ofst;
    92|  1.85k|    WORD32 round_val, shft;
    93|       |
    94|  1.85k|    wt1 = (WORD16)(wt1 & 0xffff);
    95|  1.85k|    wt2 = (WORD16)(wt2 & 0xffff);
    96|  1.85k|    round_val = 1 << log_wd;
    97|  1.85k|    shft = log_wd + 1;
    98|  1.85k|    ofst1 = (WORD8)(ofst1 & 0xff);
    99|  1.85k|    ofst2 = (WORD8)(ofst2 & 0xff);
   100|  1.85k|    ofst = (ofst1 + ofst2 + 1) >> 1;
   101|       |
   102|  1.85k|    wt1_8x32b = _mm256_set1_epi16(wt1);
   103|  1.85k|    wt2_8x32b = _mm256_set1_epi16(wt2);
   104|  1.85k|    round_8x32b = _mm256_set1_epi16(round_val);
   105|  1.85k|    ofst_8x32b = _mm256_set1_epi16(ofst);
   106|       |
   107|       |
   108|  1.85k|    if(wd == 4)
   109|      0|    {
   110|      0|        __m128i y1_2_16x8b, y1_3_16x8b;
   111|      0|        __m128i y2_2_16x8b, y2_3_16x8b;
   112|       |
   113|      0|        __m256i y1_02_32x8b, y1_13_32x8b, y2_02_32x8b, y2_13_32x8b, y1_0_32x8b, y2_0_32x8b, y1_0_8x32b, y2_1_8x32b, y2_0_8x32b;
   114|      0|        __m128i y1_0_16x8b_128, y2_0_16x8b_128, y1_1_16x8b_128, y1_2_16x8b_128, y1_3_16x8b_128;
   115|       |
   116|      0|        do
   117|      0|        {
   118|      0|            y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
   119|      0|            y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
   120|       |
   121|      0|            y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
   122|      0|            y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
   123|       |
   124|      0|            y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
   125|      0|            y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
   126|      0|            y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
   127|      0|            y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
   128|       |
   129|      0|            y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b);
   130|      0|            y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b);
   131|      0|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8));
   132|      0|            y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8));
   133|       |
   134|      0|            y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // widen u8 to s16
   135|      0|            y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
   136|       |
   137|      0|            y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
   138|      0|            y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
   139|       |
   140|      0|            y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
   141|       |
   142|      0|            y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
   143|       |
   144|      0|            y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
   145|       |
   146|      0|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b));
   147|      0|            y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4);
   148|      0|            y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8);
   149|      0|            y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12);
   150|       |
   151|      0|            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128);
   152|      0|            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128);
   153|      0|            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128);
   154|      0|            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128);
   155|       |
   156|      0|            ht -= 4;
   157|      0|            pu1_src1 += src_strd1 << 2;
   158|      0|            pu1_src2 += src_strd2 << 2;
   159|      0|            pu1_dst += dst_strd << 2;
   160|      0|        }
   161|      0|        while(ht > 0);
   162|      0|    }
   163|  1.85k|    else if(wd == 8)
   164|    184|    {
   165|    184|        __m128i y1_0_16x8b_128, y2_0_16x8b_128, y1_2_16x8b_128, y1_1_16x8b_128, y1_3_16x8b_128;
   166|    184|        __m256i y1_02_32x8b, y1_13_32x8b, y2_02_32x8b, y2_13_32x8b, y1_0_32x8b, y2_0_32x8b, y1_0_8x32b;
   167|    184|        __m256i y1_1_8x32b, y2_0_8x32b, y2_1_8x32b;
   168|       |
   169|    184|        do
   170|    736|        {
   171|       |
   172|    736|            y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
   173|    736|            y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
   174|       |
   175|    736|            y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
   176|    736|            y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
   177|       |
   178|    736|            y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
   179|    736|            y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
   180|    736|            y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
   181|    736|            y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
   182|       |
   183|    736|            y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b);
   184|    736|            y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b);
   185|       |
   186|    736|            y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b));
   187|    736|            y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b, y1_0_32x8b, 0x1));
   188|    736|            y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128);
   189|       |
   190|    736|            y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b));
   191|    736|            y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b, y2_0_32x8b, 0x1));
   192|    736|            y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
   193|       |
   194|       |
   195|    736|            y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
   196|    736|            y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
   197|    736|            y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b);
   198|    736|            y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b);
   199|       |
   200|    736|            y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
   201|    736|            y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b);
   202|       |
   203|    736|            y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
   204|    736|            y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft);
   205|       |
   206|    736|            y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
   207|    736|            y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b);
   208|       |
   209|    736|            y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b);
   210|    736|            y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
   211|    736|            y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
   212|       |
   213|    736|            y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b, y1_0_32x8b, 1);
   214|    736|            y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
   215|    736|            y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
   216|       |
   217|    736|            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128);
   218|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128);
   219|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128);
   220|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128);
   221|       |
   222|    736|            ht -= 4;
   223|    736|            pu1_src1 += src_strd1 << 2;
   224|    736|            pu1_src2 += src_strd2 << 2;
   225|    736|            pu1_dst += dst_strd << 2;
   226|       |
   227|    736|        }
   228|    736|        while(ht > 0);
   229|    184|    }
   230|  1.66k|    else // wd == 16
   231|  1.66k|    {
   232|  1.66k|        __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
   233|  1.66k|        __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
   234|       |
   235|  1.66k|        __m256i zero_32x8b, y1_0_32x8b, y2_0_32x8b;
   236|  1.66k|        zero_32x8b = _mm256_set1_epi8(0);
   237|       |
   238|  1.66k|        do
   239|  10.2k|        {
   240|       |
   241|  10.2k|            y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
   242|  10.2k|            y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
   243|       |
   244|  10.2k|            y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
   245|  10.2k|            y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
   246|       |
   247|  10.2k|            y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
   248|  10.2k|            y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
   249|       |
   250|  10.2k|            y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
   251|  10.2k|            y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
   252|       |
   253|  10.2k|            y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
   254|  10.2k|            y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
   255|       |
   256|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
   257|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
   258|       |
   259|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
   260|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
   261|       |
   262|  10.2k|            y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
   263|  10.2k|            y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
   264|       |
   265|  10.2k|            y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
   266|  10.2k|            y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
   267|       |
   268|  10.2k|            y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
   269|       |
   270|  10.2k|            _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
   271|       |
   272|  10.2k|            ht -= 2;
   273|  10.2k|            pu1_src1 += src_strd1 << 1;
   274|  10.2k|            pu1_src2 += src_strd2 << 1;
   275|  10.2k|            pu1_dst += dst_strd << 1;
   276|  10.2k|        }
   277|  10.2k|        while(ht > 0);
   278|  1.66k|    }
   279|  1.85k|}
   280|       |
   281|       |
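Note: all three width-specific paths above compute the same per-pixel result; only the load/shuffle plumbing differs. Below is a minimal scalar sketch of the sec 8.4.2.3.2 formula as a reading aid; clip_u8 and weighted_bi_pred_ref are hypothetical names, not library functions, and ofst corresponds to (ofst1 + ofst2 + 1) >> 1 computed at line 100. Also observable in the listing: only the wd == 16 path adds round_8x32b before the shift (line 259); the wd == 4 and wd == 8 paths shift without the rounding term (lines 142 and 203).

    /* Hypothetical scalar reference for H.264 weighted bi-prediction. */
    #include <stdint.h>

    static uint8_t clip_u8(int32_t x)
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    static void weighted_bi_pred_ref(const uint8_t *src1, const uint8_t *src2,
                                     uint8_t *dst, int32_t strd1, int32_t strd2,
                                     int32_t dst_strd, int32_t log_wd,
                                     int32_t wt1, int32_t wt2, int32_t ofst,
                                     int32_t ht, int32_t wd)
    {
        int32_t round_val = 1 << log_wd; /* rounding term added before the shift */
        int32_t r, c;

        for(r = 0; r < ht; r++)
            for(c = 0; c < wd; c++)
                dst[r * dst_strd + c] =
                    clip_u8(((src1[r * strd1 + c] * wt1 +
                              src2[r * strd2 + c] * wt2 +
                              round_val) >> (log_wd + 1)) + ofst);
    }
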
   282|       |/*****************************************************************************/
   283|       |/*                                                                           */
   284|       |/*  Function Name : ih264_weighted_bi_pred_chroma_avx2                       */
   285|       |/*                                                                           */
   286|       |/*  Description   : This function performs the weighted biprediction as      */
   287|       |/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
   288|       |/*                  prediction process" for chroma. The function gets two    */
   289|       |/*                  ht x wd blocks, weights them, adds them, rounds off the  */
   290|       |/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
   291|       |/*                  stores it in the destination block. (ht,wd) can be       */
   292|       |/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
   293|       |/*                                                                           */
   294|       |/*  Inputs        : pu1_src1  - Pointer to source 1                          */
   295|       |/*                  pu1_src2  - Pointer to source 2                          */
   296|       |/*                  pu1_dst   - Pointer to destination                       */
   297|       |/*                  src_strd1 - stride for source 1                          */
   298|       |/*                  src_strd2 - stride for source 2                          */
   299|       |/*                  dst_strd  - stride for destination                       */
   300|       |/*                  log_wd    - number of bits to be rounded off             */
   301|       |/*                  wt1       - weight values for u and v in source 1        */
   302|       |/*                  wt2       - weight values for u and v in source 2        */
   303|       |/*                  ofst1     - offset value for u and v in source 1         */
   304|       |/*                  ofst2     - offset value for u and v in source 2         */
   305|       |/*                  ht        - height of the block                          */
   306|       |/*                  wd        - width of the block                           */
   307|       |/*                                                                           */
   308|       |/*  Issues        : None                                                     */
   309|       |/*                                                                           */
   310|       |/*  Revision History:                                                        */
   311|       |/*                                                                           */
   312|       |/*         DD MM YYYY   Author(s)       Changes                              */
   313|       |/*         04 02 2015   Kaushik         Initial Version                      */
   314|       |/*                      Senthoor                                             */
   315|       |/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
   316|       |/*****************************************************************************/
   317|       |void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1,
   318|       |                                        UWORD8 *pu1_src2,
   319|       |                                        UWORD8 *pu1_dst,
   320|       |                                        WORD32 src_strd1,
   321|       |                                        WORD32 src_strd2,
   322|       |                                        WORD32 dst_strd,
   323|       |                                        WORD32 log_wd,
   324|       |                                        WORD32 wt1,
   325|       |                                        WORD32 wt2,
   326|       |                                        WORD32 ofst1,
   327|       |                                        WORD32 ofst2,
   328|       |                                        WORD32 ht,
   329|       |                                        WORD32 wd)
   330|  1.85k|{
   331|       |
   332|  1.85k|    __m128i y1_0_16x8b, y1_1_16x8b;
   333|  1.85k|    __m128i y2_0_16x8b, y2_1_16x8b;
   334|       |
   335|  1.85k|    __m128i wt1_8x16b, wt2_8x16b;
   336|  1.85k|    __m128i ofst_8x16b, round_8x16b;
   337|       |
   338|  1.85k|    WORD32 ofst1_u, ofst2_u, ofst_u;
   339|  1.85k|    WORD32 ofst1_v, ofst2_v, ofst_v;
   340|  1.85k|    WORD32 round_val, shft, ofst_val, ofst_val_256;
   341|       |
   342|  1.85k|    round_val = 1 << log_wd;
   343|  1.85k|    shft = log_wd + 1;
   344|       |
   345|  1.85k|    ofst1_u = (WORD8)(ofst1 & 0xff);
   346|  1.85k|    ofst1_v = (WORD8)(ofst1 >> 8);
   347|  1.85k|    ofst2_u = (WORD8)(ofst2 & 0xff);
   348|  1.85k|    ofst2_v = (WORD8)(ofst2 >> 8);
   349|       |
   350|  1.85k|    wt1_8x16b = _mm_set1_epi32(wt1);
   351|  1.85k|    wt2_8x16b = _mm_set1_epi32(wt2);
   352|       |
   353|  1.85k|    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
   354|  1.85k|    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
   355|  1.85k|    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
   356|  1.85k|    ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16);
   357|       |
   358|  1.85k|    round_8x16b = _mm_set1_epi16(round_val);
   359|  1.85k|    ofst_8x16b = _mm_set1_epi32(ofst_val);
   360|       |
   361|  1.85k|    if(wd == 2)
   362|      0|    {
   363|      0|        __m128i y1_0_8x16b, y2_0_8x16b;
   364|       |
   365|      0|        do
   366|      0|        {
   367|      0|            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); // load two rows, 64 bits each
   368|      0|            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
   369|       |
   370|      0|            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
   371|      0|            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
   372|       |
   373|      0|            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
   374|      0|            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
   375|       |
   376|      0|            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
   377|      0|            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
   378|       |
   379|      0|            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
   380|      0|            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
   381|       |
   382|      0|            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
   383|      0|            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
   384|       |
   385|      0|            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
   386|      0|            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
   387|       |
   388|      0|            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
   389|      0|            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
   390|       |
   391|      0|            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
   392|      0|            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
   393|       |
   394|      0|            ht -= 2;
   395|      0|            pu1_src1 += src_strd1 << 1;
   396|      0|            pu1_src2 += src_strd2 << 1;
   397|      0|            pu1_dst += dst_strd << 1;
   398|      0|        }
   399|      0|        while(ht > 0);
   400|      0|    }
   401|  1.85k|    else if(wd == 4)
   402|    184|    {
   403|    184|        __m128i y1_0_8x16b, y1_1_8x16b;
   404|    184|        __m128i y2_0_8x16b, y2_1_8x16b;
   405|       |
   406|    184|        do
   407|    736|        {
   408|    736|            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); // load two rows, 64 bits each
   409|    736|            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
   410|       |
   411|    736|            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
   412|    736|            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
   413|       |
   414|    736|            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
   415|    736|            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
   416|       |
   417|    736|            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
   418|    736|            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
   419|       |
   420|    736|            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
   421|    736|            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
   422|    736|            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
   423|    736|            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
   424|       |
   425|    736|            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
   426|    736|            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
   427|       |
   428|    736|            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
   429|    736|            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
   430|       |
   431|    736|            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
   432|    736|            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
   433|       |
   434|    736|            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
   435|    736|            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
   436|       |
   437|    736|            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
   438|    736|            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
   439|       |
   440|    736|            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
   441|    736|            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
   442|       |
   443|    736|            ht -= 2;
   444|    736|            pu1_src1 += src_strd1 << 1;
   445|    736|            pu1_src2 += src_strd2 << 1;
   446|    736|            pu1_dst += dst_strd << 1;
   447|    736|        }
   448|    736|        while(ht > 0);
   449|    184|    }
   450|  1.66k|    else // wd == 8
   451|  1.66k|    {
   452|  1.66k|        __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
   453|  1.66k|        __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
   454|  1.66k|        __m256i y1_0_32x8b, y2_0_32x8b, ofst_8x32b, round_8x32b;
   455|  1.66k|        __m256i wt1_8x32b, wt2_8x32b;
   456|  1.66k|        __m256i zero_32x8b;
   457|       |
   458|  1.66k|        wt1_8x32b = _mm256_set1_epi16(wt1);
   459|  1.66k|        wt2_8x32b = _mm256_set1_epi16(wt2);
   460|  1.66k|        round_8x32b = _mm256_set1_epi16(round_val);
   461|  1.66k|        ofst_8x32b = _mm256_set1_epi32(ofst_val_256);
   462|  1.66k|        zero_32x8b = _mm256_set1_epi8(0);
   463|       |
   464|  1.66k|        do
   465|  5.12k|        {
   466|  5.12k|            y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
   467|  5.12k|            y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
   468|  5.12k|            y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
   469|  5.12k|            y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
   470|  5.12k|            y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
   471|  5.12k|            y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
   472|  5.12k|            y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
   473|  5.12k|            y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
   474|       |
   475|  5.12k|            y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
   476|  5.12k|            y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
   477|       |
   478|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
   479|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
   480|       |
   481|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
   482|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
   483|       |
   484|  5.12k|            y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
   485|  5.12k|            y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
   486|       |
   487|  5.12k|            y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
   488|  5.12k|            y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
   489|       |
   490|       |
   491|  5.12k|            y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
   492|  5.12k|            _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
   493|       |
   494|  5.12k|            ht -= 2;
   495|  5.12k|            pu1_src1 += src_strd1 << 1;
   496|  5.12k|            pu1_src2 += src_strd2 << 1;
   497|  5.12k|            pu1_dst += dst_strd << 1;
   498|  5.12k|        }
   499|  5.12k|        while(ht > 0);
   500|  1.66k|    }
   501|  1.85k|}
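
In the chroma variant, wt1 and wt2 pack the U and V weights as the two 16-bit halves of one 32-bit value (hence _mm_set1_epi32 at lines 350-351), and ofst1/ofst2 carry the U and V offsets in their low and high bytes, matching the interleaved UVUV source layout. As a sanity check against the covered wd == 16 luma path, here is a minimal hypothetical harness (not part of the report; it assumes the AVX2 declaration is visible via ih264_weighted_pred.h, which the file itself includes). With wt1 = wt2 = 32 and log_wd = 5, the H.264 default-weight case, the formula reduces to (s1 + s2 + 1) >> 1:

    /* Hypothetical call sketch; exercises the covered wd == 16 luma path. */
    #include <stdio.h>
    #include <string.h>
    #include "ih264_typedefs.h"
    #include "ih264_weighted_pred.h"

    int main(void)
    {
        UWORD8 src1[16 * 16], src2[16 * 16], dst[16 * 16];

        memset(src1, 100, sizeof(src1));
        memset(src2, 102, sizeof(src2));

        ih264_weighted_bi_pred_luma_avx2(src1, src2, dst,
                                         16, 16, 16, /* strides          */
                                         5, 32, 32,  /* log_wd, wt1, wt2 */
                                         0, 0,       /* ofst1, ofst2     */
                                         16, 16);    /* ht, wd           */

        /* Every pixel: (100*32 + 102*32 + 32) >> 6 = 101. */
        printf("dst[0] = %d\n", dst[0]);
        return dst[0] == 101 ? 0 : 1;
    }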