/src/libavc/common/x86/ih264_ihadamard_scaling_avx2.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | + * |
3 | | + * Copyright (C) 2015 The Android Open Source Project |
4 | | + * |
5 | | + * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | + * you may not use this file except in compliance with the License. |
7 | | + * You may obtain a copy of the License at: |
8 | | + * |
9 | | + * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | + * |
11 | | + * Unless required by applicable law or agreed to in writing, software |
12 | | + * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | + * See the License for the specific language governing permissions and |
15 | | + * limitations under the License. |
16 | | + * |
17 | | + ***************************************************************************** |
18 | | + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | +*/ |
20 | | /** |
21 | | + ******************************************************************************* |
22 | | + * @file |
23 | | + * ih264_ihadamard_scaling_avx2.c |
24 | | + * |
25 | | + * @brief |
26 | | + * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling |
27 | | + * |
28 | | + * @author |
29 | | + * Priyanka |
30 | | + * |
31 | | + * @par List of Functions: |
32 | | + * - ih264_ihadamard_scaling_4x4_avx2() |
33 | | + * |
34 | | + * @remarks |
35 | | + * |
36 | | + ******************************************************************************* |
37 | | + */ |
38 | | /*****************************************************************************/ |
39 | | /* File Includes */ |
40 | | /*****************************************************************************/ |
41 | | |
42 | | /* User include files */ |
43 | | #include "ih264_typedefs.h" |
44 | | #include "ih264_defs.h" |
45 | | #include "ih264_trans_macros.h" |
46 | | #include "ih264_macros.h" |
47 | | #include "ih264_trans_data.h" |
48 | | #include "ih264_size_defs.h" |
49 | | #include "ih264_structs.h" |
50 | | #include "ih264_trans_quant_itrans_iquant.h" |
51 | | #include <immintrin.h> |
52 | | |
53 | | /* |
54 | | + ******************************************************************************** |
55 | | + * |
56 | | + * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients |
57 | | + * of a 16x16 intra prediction macroblock, and then performs scaling. |
58 | | + * prediction buffer |
59 | | + * |
60 | | + * @par Description: |
61 | | + * The DC coefficients pass through a 2-stage inverse hadamard transform. |
62 | | + * The inverse-transformed content is then scaled based on the Qp value. |
63 | | + * |
64 | | + * @param[in] pi2_src |
65 | | + * input 4x4 block of DC coefficients |
66 | | + * |
67 | | + * @param[out] pi2_out |
68 | | + * output 4x4 block |
69 | | + * |
70 | | + * @param[in] pu2_iscal_mat |
71 | | + * pointer to scaling list |
72 | | + * |
73 | | + * @param[in] pu2_weigh_mat |
74 | | + * pointer to weight matrix |
75 | | + * |
76 | | + * @param[in] u4_qp_div_6 |
77 | | + * Floor (qp/6) |
78 | | + * |
79 | | + * @param[in] pi4_tmp |
80 | | + * temporary buffer of size 1*16 (not used by this AVX2 implementation) |
81 | | + * |
82 | | + * @returns none |
83 | | + * |
84 | | + * @remarks none |
85 | | + * |
86 | | + ******************************************************************************* |
87 | | */ |
88 | | |
89 | | #include <stdint.h> |
90 | | #include <string.h> |
91 | | |
92 | | #include <stdio.h> |
93 | | |
94 | | #ifdef __ANDROID__ |
95 | | #include "log/log.h" |
96 | | #include <cutils/log.h> |
97 | | #endif |
98 | | |
99 | | |
100 | | void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src, |
101 | | WORD16* pi2_out, |
102 | | const UWORD16 *pu2_iscal_mat, |
103 | | const UWORD16 *pu2_weigh_mat, |
104 | | UWORD32 u4_qp_div_6, |
105 | | WORD32* pi4_tmp) |
106 | 2.89k | { |
107 | 2.89k | __m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2; |
108 | 2.89k | __m256i src_r0_r1, src_r2_r3; |
109 | 2.89k | __m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3; |
110 | 2.89k | __m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0); |
111 | 2.89k | __m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); |
112 | 2.89k | __m256i zero = _mm256_setzero_si256(); |
113 | | |
114 | 2.89k | __m128i t0 ,t1; |
115 | 2.89k | UNUSED (pi4_tmp); |
116 | | |
117 | 2.89k | src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row |
118 | | |
119 | 2.89k | temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero); |
120 | 2.89k | temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1... |
121 | 2.89k | temp0 = _mm256_unpacklo_epi16(temp0, temp1); |
122 | 2.89k | tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3 |
123 | 2.89k | tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2 |
124 | | |
125 | 2.89k | temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 |
126 | 2.89k | temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); |
127 | | |
128 | 2.89k | temp1 = _mm256_shuffle_epi32(temp1,0b01001110); |
129 | 2.89k | tmp0 = _mm256_add_epi16(temp0, temp1); |
130 | 2.89k | tmp1 = _mm256_sub_epi16(temp0, temp1); |
131 | | |
132 | 2.89k | temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); |
133 | 2.89k | temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); |
134 | 2.89k | tmp0 = _mm256_add_epi16(temp0, temp1); |
135 | 2.89k | tmp1 = _mm256_sub_epi16(temp0, temp1); |
136 | | |
137 | 2.89k | temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 |
138 | 2.89k | temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); |
139 | | |
140 | 2.89k | temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 |
141 | 2.89k | temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); |
142 | | |
143 | 2.89k | tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 |
144 | 2.89k | tmp1 = _mm256_unpackhi_epi16(temp0, temp1); |
145 | | |
146 | 2.89k | temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 |
147 | 2.89k | temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); |
148 | | |
149 | 2.89k | temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2)); |
150 | 2.89k | tmp0 = _mm256_add_epi16(temp0, temp1); |
151 | 2.89k | tmp1 = _mm256_sub_epi16(temp0, temp1); |
152 | 2.89k | temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); |
153 | 2.89k | temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); |
154 | 2.89k | tmp0 = _mm256_add_epi16(temp0, temp1); |
155 | 2.89k | tmp1 = _mm256_sub_epi16(temp0, temp1); |
156 | | |
157 | 2.89k | temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); |
158 | 2.89k | temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); |
159 | | |
160 | | |
161 | 2.89k | r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0)); |
162 | 2.89k | r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1)); |
163 | | |
164 | 2.89k | src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val); |
165 | 2.89k | src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val); |
166 | | |
167 | | //Scaling |
168 | 2.89k | if(u4_qp_div_6 >= 6) |
169 | 1.63k | { |
170 | 1.63k | src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6); |
171 | 1.63k | src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6); |
172 | 1.63k | } |
173 | 1.26k | else |
174 | 1.26k | { |
175 | 1.26k | temp0 = _mm256_add_epi32(src_r0_r1, add_rshift); |
176 | 1.26k | temp1 = _mm256_add_epi32(src_r2_r3, add_rshift); |
177 | 1.26k | src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6); |
178 | 1.26k | src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6); |
179 | 1.26k | } |
180 | | |
181 | 2.89k | src = _mm256_packs_epi32(src_r0_r1, src_r2_r3); |
182 | 2.89k | _mm256_storeu_si256((__m256i *) (&pi2_out[0]), src); |
183 | 2.89k | } |
184 | | |