/src/libavc/common/x86/ih264_ihadamard_scaling_avx2.c

Source
/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
/**
+ *******************************************************************************
+ * @file
+ *  ih264_ihadamard_scaling_avx2.c
+ *
+ * @brief
+ *  Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
+ *
+ * @author
+ *  Priyanka
+ *
+ *  @par List of Functions:
+ *  - ih264_ihadamard_scaling_4x4_avx2()
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>

/*
+ ********************************************************************************
+ *
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+ * of a 16x16 intra prediction macroblock, and then performs scaling.
+ *
+ * @par Description:
+ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
+ *  The inverse-transformed content is then scaled based on the Qp value.
+ *
+ * @param[in] pi2_src
+ *  input 4x4 block of DC coefficients
+ *
+ * @param[out] pi2_out
+ *  output 4x4 block
+ *
+ * @param[in] pu2_iscal_mat
+ *  pointer to scaling list
+ *
+ * @param[in] pu2_weigh_mat
+ *  pointer to weight matrix
+ *
+ * @param[in] u4_qp_div_6
+ *  Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16 (not used by this AVX2 implementation)
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
*/

#include <stdint.h>
#include <string.h>

#include <stdio.h>

#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif


/*
 * AVX2 4x4 inverse Hadamard transform followed by inverse quantisation.
 *
 * Reads 16 contiguous WORD16 coefficients from pi2_src (treated as a row-major
 * 4x4 block), applies the butterfly once across the columns of every row and
 * once across the rows, multiplies every result by
 * pu2_iscal_mat[0] * pu2_weigh_mat[0], applies the Qp dependent shift and
 * stores 16 WORD16 values (signed saturation) to pi2_out.
 *
 * Both butterfly passes use 16-bit lane arithmetic (_mm256_add_epi16 /
 * _mm256_sub_epi16), so intermediate sums wrap modulo 2^16. Only the scaling
 * stage is carried out in 32 bits.
 *
 * pi4_tmp is accepted for interface compatibility and is not accessed.
 */
void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
                                      WORD16* pi2_out,
                                      const UWORD16 *pu2_iscal_mat,
                                      const UWORD16 *pu2_weigh_mat,
                                      UWORD32 u4_qp_div_6,
                                      WORD32* pi4_tmp)
{
    __m256i src;
    __m256i temp0, temp1, tmp0, tmp1;
    __m256i res_r0_r2, res_r1_r3;
    const __m256i zero = _mm256_setzero_si256();

    /* Rounding offset 2^(5 - qp/6) for the right-shift path; 0 otherwise */
    const __m256i add_rshift =
        _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);

    /* Multiply as UWORD32: operands narrower than int are promoted to signed
     * int, where a large product would overflow (undefined behaviour).
     * NOTE(review): assumes UWORD16 is a 16-bit unsigned type - confirm
     * against ih264_typedefs.h. */
    const __m256i mult_val = _mm256_set1_epi32(
        (WORD32)((UWORD32)pu2_iscal_mat[0] * (UWORD32)pu2_weigh_mat[0]));

    UNUSED(pi4_tmp);

    /* low lane : a00 a01 a02 a03 a10 a11 a12 a13 (rows 0, 1)
     * high lane: a20 a21 a22 a23 a30 a31 a32 a33 (rows 2, 3) */
    src = _mm256_loadu_si256((const __m256i *)pi2_src);

    /* Interleave each row pair element by element:
     * low lane : a00 a10 a01 a11 a02 a12 a03 a13
     * high lane: a20 a30 a21 a31 a22 a32 a23 a33 */
    temp0 = _mm256_unpacklo_epi64(src, zero);
    temp1 = _mm256_unpackhi_epi64(src, zero);
    temp0 = _mm256_unpacklo_epi16(temp0, temp1);

    /* Move each 128-bit lane into the low half of its own register. The upper
     * lanes are zero from here on; only the low 128 bits carry data. */
    tmp0 = _mm256_permute2x128_si256(temp0, zero, 0x20);
    tmp1 = _mm256_permute2x128_si256(temp0, zero, 0x31);

    /* Transposed input: temp0 = col0 | col1, temp1 = col2 | col3 */
    temp0 = _mm256_unpacklo_epi32(tmp0, tmp1);
    temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);

    /* ---- First pass: butterfly across the four columns of every row ---- */
    /* Swap 64-bit halves so col3 lines up with col0 and col2 with col1 */
    temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
    tmp0 = _mm256_add_epi16(temp0, temp1);        /* c0+c3 | c1+c2 */
    tmp1 = _mm256_sub_epi16(temp0, temp1);        /* c0-c3 | c1-c2 */

    temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);    /* c0+c3 | c0-c3 */
    temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);    /* c1+c2 | c1-c2 */
    tmp0 = _mm256_add_epi16(temp0, temp1);        /* y0 | y1 */
    tmp1 = _mm256_sub_epi16(temp0, temp1);        /* y2 | y3 */

    /* ---- Transpose back so each 64-bit half holds one row again ---- */
    temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);    /* y0 | y2 */
    temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);    /* y1 | y3 */

    tmp0 = _mm256_unpacklo_epi16(temp0, temp1);   /* y0[0] y1[0] y0[1] y1[1] ... */
    tmp1 = _mm256_unpackhi_epi16(temp0, temp1);   /* y2[0] y3[0] y2[1] y3[1] ... */

    temp0 = _mm256_unpacklo_epi32(tmp0, tmp1);    /* row0 | row1 */
    temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);    /* row2 | row3 */

    /* ---- Second pass: same butterfly across the four rows ---- */
    temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2)); /* row3 | row2 */
    tmp0 = _mm256_add_epi16(temp0, temp1);        /* r0+r3 | r1+r2 */
    tmp1 = _mm256_sub_epi16(temp0, temp1);        /* r0-r3 | r1-r2 */
    temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);    /* r0+r3 | r0-r3 */
    temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);    /* r1+r2 | r1-r2 */
    tmp0 = _mm256_add_epi16(temp0, temp1);        /* out row0 | out row1 */
    tmp1 = _mm256_sub_epi16(temp0, temp1);        /* out row2 | out row3 */

    /* Pair rows as (0,2) and (1,3): after widening, row 0/1 sit in the low
     * lanes and row 2/3 in the high lanes, which is exactly the arrangement the
     * lane-wise pack at the end needs to emit rows in order 0,1,2,3. */
    temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);    /* out row0 | out row2 */
    temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);    /* out row1 | out row3 */

    /* Sign-extend the 8 valid 16-bit values of each register to 32 bits */
    res_r0_r2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
    res_r1_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));

    /* ---- Scaling ---- */
    res_r0_r2 = _mm256_mullo_epi32(res_r0_r2, mult_val);
    res_r1_r3 = _mm256_mullo_epi32(res_r1_r3, mult_val);

    if(u4_qp_div_6 >= 6)
    {
        /* Net left shift by (qp/6 - 6); no rounding needed */
        res_r0_r2 = _mm256_slli_epi32(res_r0_r2, u4_qp_div_6 - 6);
        res_r1_r3 = _mm256_slli_epi32(res_r1_r3, u4_qp_div_6 - 6);
    }
    else
    {
        /* Net arithmetic right shift by (6 - qp/6) with round-to-nearest */
        res_r0_r2 = _mm256_add_epi32(res_r0_r2, add_rshift);
        res_r1_r3 = _mm256_add_epi32(res_r1_r3, add_rshift);
        res_r0_r2 = _mm256_srai_epi32(res_r0_r2, 6 - u4_qp_div_6);
        res_r1_r3 = _mm256_srai_epi32(res_r1_r3, 6 - u4_qp_div_6);
    }

    /* Lane-wise saturating pack: low lane -> row0 row1, high lane -> row2 row3 */
    src = _mm256_packs_epi32(res_r0_r2, res_r1_r3);
    _mm256_storeu_si256((__m256i *)(&pi2_out[0]), src);
}


Line	Count	Source
1		/******************************************************************************
2		+ *
3		+ * Copyright (C) 2015 The Android Open Source Project
4		+ *
5		+ * Licensed under the Apache License, Version 2.0 (the "License");
6		+ * you may not use this file except in compliance with the License.
7		+ * You may obtain a copy of the License at:
8		+ *
9		+ * http://www.apache.org/licenses/LICENSE-2.0
10		+ *
11		+ * Unless required by applicable law or agreed to in writing, software
12		+ * distributed under the License is distributed on an "AS IS" BASIS,
13		+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		+ * See the License for the specific language governing permissions and
15		+ * limitations under the License.
16		+ *
17		+ *****************************************************************************
18		+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19		+*/
20		/**
21		+ *******************************************************************************
22		+ * @file
23		+ * ih264_ihadamard_scaling_avx2.c
24		+ *
25		+ * @brief
26		+ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
27		+ *
28		+ * @author
29		+ * Priyanka
30		+ *
31		+ * @par List of Functions:
32		+ * - ih264_ihadamard_scaling_4x4_avx2()
33		+ *
34		+ * @remarks
35		+ *
36		+ *******************************************************************************
37		+ */
38		/*****************************************************************************/
39		/* File Includes */
40		/*****************************************************************************/
41
42		/* User include files */
43		#include "ih264_typedefs.h"
44		#include "ih264_defs.h"
45		#include "ih264_trans_macros.h"
46		#include "ih264_macros.h"
47		#include "ih264_trans_data.h"
48		#include "ih264_size_defs.h"
49		#include "ih264_structs.h"
50		#include "ih264_trans_quant_itrans_iquant.h"
51		#include <immintrin.h>
52
53		/*
54		+ ********************************************************************************
55		+ *
56		+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
57		+ * of a 16x16 intra prediction macroblock, and then performs scaling.
58		+ * prediction buffer
59		+ *
60		+ * @par Description:
61		+ * The DC coefficients pass through a 2-stage inverse hadamard transform.
62		+ * The inverse-transformed content is then scaled based on the Qp value.
63		+ *
64		+ * @param[in] pi2_src
65		+ * input 4x4 block of DC coefficients
66		+ *
67		+ * @param[out] pi2_out
68		+ * output 4x4 block
69		+ *
70		+ * @param[in] pu2_iscal_mat
71		+ * pointer to scaling list
72		+ *
73		+ * @param[in] pu2_weigh_mat
74		+ * pointer to weight matrix
75		+ *
76		+ * @param[in] u4_qp_div_6
77		+ * Floor (qp/6)
78		+ *
79		+ * @param[in] pi4_tmp
80		+ * temporary buffer of size 1*16 (not used by this AVX2 implementation)
81		+ *
82		+ * @returns none
83		+ *
84		+ * @remarks none
85		+ *
86		+ *******************************************************************************
87		*/
88
89		#include <stdint.h>
90		#include <string.h>
91
92		#include <stdio.h>
93
94		#ifdef __ANDROID__
95		#include "log/log.h"
96		#include <cutils/log.h>
97		#endif
98
99
100		void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
101		WORD16* pi2_out,
102		const UWORD16 *pu2_iscal_mat,
103		const UWORD16 *pu2_weigh_mat,
104		UWORD32 u4_qp_div_6,
105		WORD32* pi4_tmp)
106	2.89k	{
107	2.89k	__m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2;
108	2.89k	__m256i src_r0_r1, src_r2_r3;
109	2.89k	__m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3;
110	2.89k	__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
111	2.89k	__m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
112	2.89k	__m256i zero = _mm256_setzero_si256();
113
114	2.89k	__m128i t0 ,t1;
115	2.89k	UNUSED (pi4_tmp);
116
117	2.89k	src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
118
119	2.89k	temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero);
120	2.89k	temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1...
121	2.89k	temp0 = _mm256_unpacklo_epi16(temp0, temp1);
122	2.89k	tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3
123	2.89k	tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2
124
125	2.89k	temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
126	2.89k	temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
127
128	2.89k	temp1 = _mm256_shuffle_epi32(temp1,0b01001110);
129	2.89k	tmp0 = _mm256_add_epi16(temp0, temp1);
130	2.89k	tmp1 = _mm256_sub_epi16(temp0, temp1);
131
132	2.89k	temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
133	2.89k	temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
134	2.89k	tmp0 = _mm256_add_epi16(temp0, temp1);
135	2.89k	tmp1 = _mm256_sub_epi16(temp0, temp1);
136
137	2.89k	temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
138	2.89k	temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
139
140	2.89k	temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
141	2.89k	temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
142
143	2.89k	tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
144	2.89k	tmp1 = _mm256_unpackhi_epi16(temp0, temp1);
145
146	2.89k	temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
147	2.89k	temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
148
149	2.89k	temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
150	2.89k	tmp0 = _mm256_add_epi16(temp0, temp1);
151	2.89k	tmp1 = _mm256_sub_epi16(temp0, temp1);
152	2.89k	temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
153	2.89k	temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
154	2.89k	tmp0 = _mm256_add_epi16(temp0, temp1);
155	2.89k	tmp1 = _mm256_sub_epi16(temp0, temp1);
156
157	2.89k	temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
158	2.89k	temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
159
160
161	2.89k	r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
162	2.89k	r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));
163
164	2.89k	src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val);
165	2.89k	src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val);
166
167		//Scaling
168	2.89k	if(u4_qp_div_6 >= 6)
169	1.63k	{
170	1.63k	src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6);
171	1.63k	src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6);
172	1.63k	}
173	1.26k	else
174	1.26k	{
175	1.26k	temp0 = _mm256_add_epi32(src_r0_r1, add_rshift);
176	1.26k	temp1 = _mm256_add_epi32(src_r2_r3, add_rshift);
177	1.26k	src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6);
178	1.26k	src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6);
179	1.26k	}
180
181	2.89k	src = _mm256_packs_epi32(src_r0_r1, src_r2_r3);
182	2.89k	_mm256_storeu_si256((__m256i *) (&pi2_out[0]), src);
183	2.89k	}
184

Coverage Report

Created: 2025-12-14 06:42