/src/libavc/common/x86/ih264_ihadamard_scaling_sse42.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ******************************************************************************* |
22 | | * @file |
23 | | * ih264_ihadamard_scaling_sse42.c |
24 | | * |
25 | | * @brief |
26 | | * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling |
27 | | * |
28 | | * @author |
29 | | * Mohit |
30 | | * |
31 | | * @par List of Functions: |
32 | | * - ih264_ihadamard_scaling_4x4_sse42() |
33 | | * - ih264_ihadamard_scaling_2x2_uv_sse42() |
34 | | * |
35 | | * @remarks |
36 | | * |
37 | | ******************************************************************************* |
38 | | */ |
39 | | /*****************************************************************************/ |
40 | | /* File Includes */ |
41 | | /*****************************************************************************/ |
42 | | |
43 | | /* User include files */ |
44 | | #include "ih264_typedefs.h" |
45 | | #include "ih264_defs.h" |
46 | | #include "ih264_trans_macros.h" |
47 | | #include "ih264_macros.h" |
48 | | #include "ih264_trans_data.h" |
49 | | #include "ih264_size_defs.h" |
50 | | #include "ih264_structs.h" |
51 | | #include "ih264_trans_quant_itrans_iquant.h" |
52 | | #include <immintrin.h> |
53 | | |
54 | | /* |
55 | | ******************************************************************************** |
56 | | * |
57 | | * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients |
58 | | * of a 16x16 intra prediction macroblock, and then performs scaling. |
59 | | * prediction buffer |
60 | | * |
61 | | * @par Description: |
62 | | * The DC coefficients pass through a 2-stage inverse hadamard transform. |
63 | | * This inverse transformed content is scaled to based on Qp value. |
64 | | * |
65 | | * @param[in] pi2_src |
66 | | * input 4x4 block of DC coefficients |
67 | | * |
68 | | * @param[out] pi2_out |
69 | | * output 4x4 block |
70 | | * |
71 | | * @param[in] pu2_iscal_mat |
72 | | * pointer to scaling list |
73 | | * |
74 | | * @param[in] pu2_weigh_mat |
75 | | * pointer to weight matrix |
76 | | * |
77 | | * @param[in] u4_qp_div_6 |
78 | | * Floor (qp/6) |
79 | | * |
80 | | * @param[in] pi4_tmp |
81 | | * temporary buffer of size 1*16 |
82 | | * |
83 | | * @returns none |
84 | | * |
85 | | * @remarks none |
86 | | * |
87 | | ******************************************************************************* |
88 | | */ |
89 | | void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, |
90 | | WORD16* pi2_out, |
91 | | const UWORD16 *pu2_iscal_mat, |
92 | | const UWORD16 *pu2_weigh_mat, |
93 | | UWORD32 u4_qp_div_6, |
94 | | WORD32* pi4_tmp) |
95 | 840k | { |
96 | 840k | __m128i src_r0_r1, src_r2_r3; |
97 | 840k | __m128i src_r0, src_r1, src_r2, src_r3; |
98 | 840k | __m128i temp0, temp1, temp2, temp3; |
99 | 840k | __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0); |
100 | 840k | __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); |
101 | 840k | UNUSED (pi4_tmp); |
102 | | |
103 | 840k | src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row |
104 | 840k | src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row |
105 | | //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); |
106 | 840k | src_r0 = _mm_cvtepi16_epi32(src_r0_r1); |
107 | 840k | src_r0_r1 = _mm_srli_si128(src_r0_r1, 8); |
108 | 840k | src_r1 = _mm_cvtepi16_epi32(src_r0_r1); |
109 | | |
110 | 840k | src_r2 = _mm_cvtepi16_epi32(src_r2_r3); |
111 | 840k | src_r2_r3 = _mm_srli_si128(src_r2_r3, 8); |
112 | 840k | src_r3 = _mm_cvtepi16_epi32(src_r2_r3); |
113 | | |
114 | | /* Perform Inverse transform */ |
115 | | /*-------------------------------------------------------------*/ |
116 | | /* IDCT [ Horizontal transformation ] */ |
117 | | /*-------------------------------------------------------------*/ |
118 | | // Matrix transpose |
119 | | /* |
120 | | * a0 a1 a2 a3 |
121 | | * b0 b1 b2 b3 |
122 | | * c0 c1 c2 c3 |
123 | | * d0 d1 d2 d3 |
124 | | */ |
125 | 840k | temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 |
126 | 840k | temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 |
127 | 840k | temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 |
128 | 840k | temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 |
129 | 840k | src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 |
130 | 840k | src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 |
131 | 840k | src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 |
132 | 840k | src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 |
133 | | |
134 | 840k | temp0 = _mm_add_epi32(src_r0, src_r3); |
135 | 840k | temp1 = _mm_add_epi32(src_r1, src_r2); |
136 | 840k | temp2 = _mm_sub_epi32(src_r1, src_r2); |
137 | 840k | temp3 = _mm_sub_epi32(src_r0, src_r3); |
138 | | |
139 | 840k | src_r0 = _mm_add_epi32(temp0, temp1); |
140 | 840k | src_r1 = _mm_add_epi32(temp2, temp3); |
141 | 840k | src_r2 = _mm_sub_epi32(temp0, temp1); |
142 | 840k | src_r3 = _mm_sub_epi32(temp3, temp2); |
143 | | |
144 | | /*-------------------------------------------------------------*/ |
145 | | /* IDCT [ Vertical transformation ] */ |
146 | | /*-------------------------------------------------------------*/ |
147 | | // Matrix transpose |
148 | | /* |
149 | | * a0 b0 c0 d0 |
150 | | * a1 b1 c1 d1 |
151 | | * a2 b2 c2 d2 |
152 | | * a3 b3 c3 d3 |
153 | | */ |
154 | 840k | temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 |
155 | 840k | temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 |
156 | 840k | temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 |
157 | 840k | temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 |
158 | 840k | src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 |
159 | 840k | src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 |
160 | 840k | src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 |
161 | 840k | src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 |
162 | | |
163 | 840k | temp0 = _mm_add_epi32(src_r0, src_r3); |
164 | 840k | temp1 = _mm_add_epi32(src_r1, src_r2); |
165 | 840k | temp2 = _mm_sub_epi32(src_r1, src_r2); |
166 | 840k | temp3 = _mm_sub_epi32(src_r0, src_r3); |
167 | | |
168 | 840k | src_r0 = _mm_add_epi32(temp0, temp1); |
169 | 840k | src_r1 = _mm_add_epi32(temp2, temp3); |
170 | 840k | src_r2 = _mm_sub_epi32(temp0, temp1); |
171 | 840k | src_r3 = _mm_sub_epi32(temp3, temp2); |
172 | | |
173 | 840k | src_r0 = _mm_mullo_epi32(src_r0, mult_val); |
174 | 840k | src_r1 = _mm_mullo_epi32(src_r1, mult_val); |
175 | 840k | src_r2 = _mm_mullo_epi32(src_r2, mult_val); |
176 | 840k | src_r3 = _mm_mullo_epi32(src_r3, mult_val); |
177 | | |
178 | | //Scaling |
179 | 840k | if(u4_qp_div_6 >= 6) |
180 | 196k | { |
181 | 196k | src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); |
182 | 196k | src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); |
183 | 196k | src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); |
184 | 196k | src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); |
185 | 196k | } |
186 | 644k | else |
187 | 644k | { |
188 | 644k | temp0 = _mm_add_epi32(src_r0, add_rshift); |
189 | 644k | temp1 = _mm_add_epi32(src_r1, add_rshift); |
190 | 644k | temp2 = _mm_add_epi32(src_r2, add_rshift); |
191 | 644k | temp3 = _mm_add_epi32(src_r3, add_rshift); |
192 | 644k | src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); |
193 | 644k | src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); |
194 | 644k | src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); |
195 | 644k | src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); |
196 | 644k | } |
197 | 840k | src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); |
198 | 840k | src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); |
199 | | |
200 | 840k | _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); |
201 | 840k | _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); |
202 | 840k | } |
203 | | |
204 | | void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src, |
205 | | WORD16* pi2_out, |
206 | | const UWORD16 *pu2_iscal_mat, |
207 | | const UWORD16 *pu2_weigh_mat, |
208 | | UWORD32 u4_qp_div_6, |
209 | | WORD32* pi4_tmp) |
210 | 0 | { |
211 | 0 | __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; |
212 | 0 | __m128i zero_8x16b = _mm_setzero_si128(); |
213 | 0 | __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0])); |
214 | 0 | UNUSED(pi4_tmp); |
215 | |
|
216 | 0 | src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 |
217 | 0 | sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); |
218 | 0 | plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits |
219 | 0 | plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits |
220 | |
|
221 | 0 | temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 |
222 | 0 | temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 |
223 | 0 | plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 |
224 | 0 | plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 |
225 | 0 | temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 |
226 | 0 | temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 |
227 | |
|
228 | 0 | plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 |
229 | 0 | plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 |
230 | |
|
231 | 0 | plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 |
232 | 0 | plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 |
233 | |
|
234 | 0 | temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] |
235 | 0 | temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] |
236 | |
|
237 | 0 | temp0 = _mm_slli_epi32(temp0, u4_qp_div_6); |
238 | 0 | temp1 = _mm_slli_epi32(temp1, u4_qp_div_6); |
239 | |
|
240 | 0 | temp0 = _mm_srai_epi32(temp0, 5); |
241 | 0 | temp1 = _mm_srai_epi32(temp1, 5); |
242 | |
|
243 | 0 | temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only. |
244 | |
|
245 | 0 | _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); |
246 | |
|
247 | 0 | } |