/src/libhevc/common/ihevc_itrans_recon_32x32.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ******************************************************************************/ |
18 | | /** |
19 | | ******************************************************************************* |
20 | | * @file |
21 | | * ihevc_itrans_recon_32x32.c |
22 | | * |
23 | | * @brief |
24 | | * Contains function definitions for inverse transform and reconstruction 32x32 |
25 | | * |
26 | | * |
27 | | * @author |
28 | | * 100470 |
29 | | * |
30 | | * @par List of Functions: |
31 | | * - ihevc_itrans_recon_32x32() |
32 | | * |
33 | | * @remarks |
34 | | * None |
35 | | * |
36 | | ******************************************************************************* |
37 | | */ |
38 | | #include <stdio.h> |
39 | | #include <string.h> |
40 | | #include "ihevc_typedefs.h" |
41 | | #include "ihevc_macros.h" |
42 | | #include "ihevc_platform_macros.h" |
43 | | #include "ihevc_defs.h" |
44 | | #include "ihevc_trans_tables.h" |
45 | | #include "ihevc_itrans_recon.h" |
46 | | #include "ihevc_func_selector.h" |
47 | | #include "ihevc_trans_macros.h" |
48 | | |
49 | | |
50 | | /** |
51 | | ******************************************************************************* |
52 | | * |
53 | | * @brief |
54 | | * This function performs Inverse transform and reconstruction for 32x32 |
55 | | * input block |
56 | | * |
57 | | * @par Description: |
58 | | * Performs inverse transform and adds the prediction data and clips output |
59 | | * to 8 bit |
60 | | * |
61 | | * @param[in] pi2_src |
62 | | * Input 32x32 coefficients |
63 | | * |
64 | | * @param[in] pi2_tmp |
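65 | | * Temporary 32x32 buffer for storing the output of |
66 | | * the 1st stage of the inverse transform; the 2nd |
67 | | * stage reads its input from this buffer |
68 | | * (32 WORD16 values are written per processed column) |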
69 | | * |
70 | | * @param[in] pu1_pred |
71 | | * Prediction 32x32 block |
72 | | * |
73 | | * @param[out] pu1_dst |
74 | | * Output 32x32 block |
75 | | * |
76 | | * @param[in] src_strd |
77 | | * Input stride |
78 | | * |
79 | | * @param[in] pred_strd |
80 | | * Prediction stride |
81 | | * |
82 | | * @param[in] dst_strd |
83 | | * Output Stride |
84 | | * |
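85 | | * @param[in] zero_cols |
86 | | * Bit mask of zero columns in pi2_src (bit j set => column j is all zero) |
87 | | * |
88 | | * @param[in] zero_rows |
89 | | * Bit mask of zero rows in pi2_src (bit i set => row i is all zero) |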
90 | | * |
91 | | * @returns Void |
92 | | * |
93 | | * @remarks |
94 | | * None |
95 | | * |
96 | | ******************************************************************************* |
97 | | */ |
98 | | |
99 | | void ihevc_itrans_recon_32x32(WORD16 *pi2_src, |
100 | | WORD16 *pi2_tmp, |
101 | | UWORD8 *pu1_pred, |
102 | | UWORD8 *pu1_dst, |
103 | | WORD32 src_strd, |
104 | | WORD32 pred_strd, |
105 | | WORD32 dst_strd, |
106 | | WORD32 zero_cols, |
107 | | WORD32 zero_rows) |
108 | 904k | { |
109 | 904k | WORD32 j, k; |
110 | 904k | WORD32 e[16], o[16]; |
111 | 904k | WORD32 ee[8], eo[8]; |
112 | 904k | WORD32 eee[4], eeo[4]; |
113 | 904k | WORD32 eeee[2], eeeo[2]; |
114 | 904k | WORD32 add; |
115 | 904k | WORD32 shift; |
116 | 904k | WORD16 *pi2_tmp_orig; |
117 | 904k | WORD32 trans_size; |
118 | 904k | WORD32 zero_rows_2nd_stage = zero_cols; |
119 | 904k | WORD32 row_limit_2nd_stage; |
120 | | |
121 | 904k | trans_size = TRANS_SIZE_32; |
122 | 904k | pi2_tmp_orig = pi2_tmp; |
123 | | |
124 | 904k | if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) |
125 | 310k | row_limit_2nd_stage = 4; |
126 | 594k | else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) |
127 | 284k | row_limit_2nd_stage = 8; |
128 | 310k | else |
129 | 310k | row_limit_2nd_stage = TRANS_SIZE_32; |
130 | | |
131 | 904k | if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */ |
132 | 310k | { |
133 | | /************************************************************************************************/ |
134 | | /**********************************START - IT_RECON_32x32****************************************/ |
135 | | /************************************************************************************************/ |
136 | | /* Inverse Transform 1st stage */ |
137 | 310k | shift = IT_SHIFT_STAGE_1; |
138 | 310k | add = 1 << (shift - 1); |
139 | | |
140 | 3.85M | for(j = 0; j < row_limit_2nd_stage; j++) |
141 | 3.54M | { |
142 | | /* Checking for Zero Cols */ |
143 | 3.54M | if((zero_cols & 1) == 1) |
144 | 2.19M | { |
145 | 2.19M | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
146 | 2.19M | } |
147 | 1.34M | else |
148 | 1.34M | { |
149 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
150 | 22.9M | for(k = 0; k < 16; k++) |
151 | 21.5M | { |
152 | 21.5M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
153 | 21.5M | + g_ai2_ihevc_trans_32[3][k] |
154 | 21.5M | * pi2_src[3 * src_strd]; |
155 | 21.5M | } |
156 | 12.1M | for(k = 0; k < 8; k++) |
157 | 10.7M | { |
158 | 10.7M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]; |
159 | 10.7M | } |
160 | | // for(k = 0; k < 4; k++) |
161 | 1.34M | { |
162 | 1.34M | eeo[0] = 0; |
163 | 1.34M | eeo[1] = 0; |
164 | 1.34M | eeo[2] = 0; |
165 | 1.34M | eeo[3] = 0; |
166 | 1.34M | } |
167 | 1.34M | eeeo[0] = 0; |
168 | 1.34M | eeeo[1] = 0; |
169 | 1.34M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
170 | 1.34M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
171 | | |
172 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
173 | 1.34M | eee[0] = eeee[0] + eeeo[0]; |
174 | 1.34M | eee[3] = eeee[0] - eeeo[0]; |
175 | 1.34M | eee[1] = eeee[1] + eeeo[1]; |
176 | 1.34M | eee[2] = eeee[1] - eeeo[1]; |
177 | 6.74M | for(k = 0; k < 4; k++) |
178 | 5.39M | { |
179 | 5.39M | ee[k] = eee[k] + eeo[k]; |
180 | 5.39M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
181 | 5.39M | } |
182 | 12.1M | for(k = 0; k < 8; k++) |
183 | 10.7M | { |
184 | 10.7M | e[k] = ee[k] + eo[k]; |
185 | 10.7M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
186 | 10.7M | } |
187 | 22.8M | for(k = 0; k < 16; k++) |
188 | 21.5M | { |
189 | 21.5M | pi2_tmp[k] = |
190 | 21.5M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
191 | 21.5M | pi2_tmp[k + 16] = |
192 | 21.5M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
193 | 21.5M | } |
194 | 1.34M | } |
195 | 3.54M | pi2_src++; |
196 | 3.54M | pi2_tmp += trans_size; |
197 | 3.54M | zero_cols = zero_cols >> 1; |
198 | 3.54M | } |
199 | | |
200 | 310k | pi2_tmp = pi2_tmp_orig; |
201 | | |
202 | | /* Inverse Transform 2nd stage */ |
203 | 310k | shift = IT_SHIFT_STAGE_2; |
204 | 310k | add = 1 << (shift - 1); |
205 | 310k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
206 | 153k | { |
207 | 5.05M | for(j = 0; j < trans_size; j++) |
208 | 4.90M | { |
209 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
210 | 83.3M | for(k = 0; k < 16; k++) |
211 | 78.3M | { |
212 | 78.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
213 | 78.3M | + g_ai2_ihevc_trans_32[3][k] |
214 | 78.3M | * pi2_tmp[3 * trans_size]; |
215 | 78.3M | } |
216 | 44.1M | for(k = 0; k < 8; k++) |
217 | 39.2M | { |
218 | 39.2M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
219 | 39.2M | } |
220 | | // for(k = 0; k < 4; k++) |
221 | 4.90M | { |
222 | 4.90M | eeo[0] = 0; |
223 | 4.90M | eeo[1] = 0; |
224 | 4.90M | eeo[2] = 0; |
225 | 4.90M | eeo[3] = 0; |
226 | 4.90M | } |
227 | 4.90M | eeeo[0] = 0; |
228 | 4.90M | eeeo[1] = 0; |
229 | 4.90M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
230 | 4.90M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
231 | | |
232 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
233 | 4.90M | eee[0] = eeee[0] + eeeo[0]; |
234 | 4.90M | eee[3] = eeee[0] - eeeo[0]; |
235 | 4.90M | eee[1] = eeee[1] + eeeo[1]; |
236 | 4.90M | eee[2] = eeee[1] - eeeo[1]; |
237 | 24.4M | for(k = 0; k < 4; k++) |
238 | 19.5M | { |
239 | 19.5M | ee[k] = eee[k] + eeo[k]; |
240 | 19.5M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
241 | 19.5M | } |
242 | 44.0M | for(k = 0; k < 8; k++) |
243 | 39.1M | { |
244 | 39.1M | e[k] = ee[k] + eo[k]; |
245 | 39.1M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
246 | 39.1M | } |
247 | 82.9M | for(k = 0; k < 16; k++) |
248 | 78.0M | { |
249 | 78.0M | WORD32 itrans_out; |
250 | 78.0M | itrans_out = |
251 | 78.0M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
252 | 78.0M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
253 | 78.0M | itrans_out = |
254 | 78.0M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
255 | 78.0M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
256 | 78.0M | } |
257 | 4.90M | pi2_tmp++; |
258 | 4.90M | pu1_pred += pred_strd; |
259 | 4.90M | pu1_dst += dst_strd; |
260 | 4.90M | } |
261 | 153k | } |
262 | 157k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
263 | 86.9k | { |
264 | 2.86M | for(j = 0; j < trans_size; j++) |
265 | 2.77M | { |
266 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
267 | 47.0M | for(k = 0; k < 16; k++) |
268 | 44.2M | { |
269 | 44.2M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
270 | 44.2M | + g_ai2_ihevc_trans_32[3][k] |
271 | 44.2M | * pi2_tmp[3 * trans_size] |
272 | 44.2M | + g_ai2_ihevc_trans_32[5][k] |
273 | 44.2M | * pi2_tmp[5 * trans_size] |
274 | 44.2M | + g_ai2_ihevc_trans_32[7][k] |
275 | 44.2M | * pi2_tmp[7 * trans_size]; |
276 | 44.2M | } |
277 | 24.9M | for(k = 0; k < 8; k++) |
278 | 22.1M | { |
279 | 22.1M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
280 | 22.1M | + g_ai2_ihevc_trans_32[6][k] |
281 | 22.1M | * pi2_tmp[6 * trans_size]; |
282 | 22.1M | } |
283 | 13.8M | for(k = 0; k < 4; k++) |
284 | 11.0M | { |
285 | 11.0M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
286 | 11.0M | } |
287 | 2.77M | eeeo[0] = 0; |
288 | 2.77M | eeeo[1] = 0; |
289 | 2.77M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
290 | 2.77M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
291 | | |
292 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
293 | 2.77M | eee[0] = eeee[0] + eeeo[0]; |
294 | 2.77M | eee[3] = eeee[0] - eeeo[0]; |
295 | 2.77M | eee[1] = eeee[1] + eeeo[1]; |
296 | 2.77M | eee[2] = eeee[1] - eeeo[1]; |
297 | 13.8M | for(k = 0; k < 4; k++) |
298 | 11.0M | { |
299 | 11.0M | ee[k] = eee[k] + eeo[k]; |
300 | 11.0M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
301 | 11.0M | } |
302 | 24.8M | for(k = 0; k < 8; k++) |
303 | 22.1M | { |
304 | 22.1M | e[k] = ee[k] + eo[k]; |
305 | 22.1M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
306 | 22.1M | } |
307 | 46.6M | for(k = 0; k < 16; k++) |
308 | 43.8M | { |
309 | 43.8M | WORD32 itrans_out; |
310 | 43.8M | itrans_out = |
311 | 43.8M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
312 | 43.8M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
313 | 43.8M | itrans_out = |
314 | 43.8M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
315 | 43.8M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
316 | 43.8M | } |
317 | 2.77M | pi2_tmp++; |
318 | 2.77M | pu1_pred += pred_strd; |
319 | 2.77M | pu1_dst += dst_strd; |
320 | 2.77M | } |
321 | 86.9k | } |
322 | 70.0k | else /* All rows of output of 1st stage are non-zero */ |
323 | 70.0k | { |
324 | 2.30M | for(j = 0; j < trans_size; j++) |
325 | 2.23M | { |
326 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
327 | 36.9M | for(k = 0; k < 16; k++) |
328 | 34.7M | { |
329 | 34.7M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
330 | 34.7M | + g_ai2_ihevc_trans_32[3][k] |
331 | 34.7M | * pi2_tmp[3 * trans_size] |
332 | 34.7M | + g_ai2_ihevc_trans_32[5][k] |
333 | 34.7M | * pi2_tmp[5 * trans_size] |
334 | 34.7M | + g_ai2_ihevc_trans_32[7][k] |
335 | 34.7M | * pi2_tmp[7 * trans_size] |
336 | 34.7M | + g_ai2_ihevc_trans_32[9][k] |
337 | 34.7M | * pi2_tmp[9 * trans_size] |
338 | 34.7M | + g_ai2_ihevc_trans_32[11][k] |
339 | 34.7M | * pi2_tmp[11 * trans_size] |
340 | 34.7M | + g_ai2_ihevc_trans_32[13][k] |
341 | 34.7M | * pi2_tmp[13 * trans_size] |
342 | 34.7M | + g_ai2_ihevc_trans_32[15][k] |
343 | 34.7M | * pi2_tmp[15 * trans_size] |
344 | 34.7M | + g_ai2_ihevc_trans_32[17][k] |
345 | 34.7M | * pi2_tmp[17 * trans_size] |
346 | 34.7M | + g_ai2_ihevc_trans_32[19][k] |
347 | 34.7M | * pi2_tmp[19 * trans_size] |
348 | 34.7M | + g_ai2_ihevc_trans_32[21][k] |
349 | 34.7M | * pi2_tmp[21 * trans_size] |
350 | 34.7M | + g_ai2_ihevc_trans_32[23][k] |
351 | 34.7M | * pi2_tmp[23 * trans_size] |
352 | 34.7M | + g_ai2_ihevc_trans_32[25][k] |
353 | 34.7M | * pi2_tmp[25 * trans_size] |
354 | 34.7M | + g_ai2_ihevc_trans_32[27][k] |
355 | 34.7M | * pi2_tmp[27 * trans_size] |
356 | 34.7M | + g_ai2_ihevc_trans_32[29][k] |
357 | 34.7M | * pi2_tmp[29 * trans_size] |
358 | 34.7M | + g_ai2_ihevc_trans_32[31][k] |
359 | 34.7M | * pi2_tmp[31 * trans_size]; |
360 | 34.7M | } |
361 | 19.9M | for(k = 0; k < 8; k++) |
362 | 17.6M | { |
363 | 17.6M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
364 | 17.6M | + g_ai2_ihevc_trans_32[6][k] |
365 | 17.6M | * pi2_tmp[6 * trans_size] |
366 | 17.6M | + g_ai2_ihevc_trans_32[10][k] |
367 | 17.6M | * pi2_tmp[10 * trans_size] |
368 | 17.6M | + g_ai2_ihevc_trans_32[14][k] |
369 | 17.6M | * pi2_tmp[14 * trans_size] |
370 | 17.6M | + g_ai2_ihevc_trans_32[18][k] |
371 | 17.6M | * pi2_tmp[18 * trans_size] |
372 | 17.6M | + g_ai2_ihevc_trans_32[22][k] |
373 | 17.6M | * pi2_tmp[22 * trans_size] |
374 | 17.6M | + g_ai2_ihevc_trans_32[26][k] |
375 | 17.6M | * pi2_tmp[26 * trans_size] |
376 | 17.6M | + g_ai2_ihevc_trans_32[30][k] |
377 | 17.6M | * pi2_tmp[30 * trans_size]; |
378 | 17.6M | } |
379 | 11.1M | for(k = 0; k < 4; k++) |
380 | 8.90M | { |
381 | 8.90M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
382 | 8.90M | + g_ai2_ihevc_trans_32[12][k] |
383 | 8.90M | * pi2_tmp[12 * trans_size] |
384 | 8.90M | + g_ai2_ihevc_trans_32[20][k] |
385 | 8.90M | * pi2_tmp[20 * trans_size] |
386 | 8.90M | + g_ai2_ihevc_trans_32[28][k] |
387 | 8.90M | * pi2_tmp[28 * trans_size]; |
388 | 8.90M | } |
389 | 2.23M | eeeo[0] = |
390 | 2.23M | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
391 | 2.23M | + g_ai2_ihevc_trans_32[24][0] |
392 | 2.23M | * pi2_tmp[24 |
393 | 2.23M | * trans_size]; |
394 | 2.23M | eeeo[1] = |
395 | 2.23M | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
396 | 2.23M | + g_ai2_ihevc_trans_32[24][1] |
397 | 2.23M | * pi2_tmp[24 |
398 | 2.23M | * trans_size]; |
399 | 2.23M | eeee[0] = |
400 | 2.23M | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
401 | 2.23M | + g_ai2_ihevc_trans_32[16][0] |
402 | 2.23M | * pi2_tmp[16 |
403 | 2.23M | * trans_size]; |
404 | 2.23M | eeee[1] = |
405 | 2.23M | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
406 | 2.23M | + g_ai2_ihevc_trans_32[16][1] |
407 | 2.23M | * pi2_tmp[16 |
408 | 2.23M | * trans_size]; |
409 | | |
410 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
411 | 2.23M | eee[0] = eeee[0] + eeeo[0]; |
412 | 2.23M | eee[3] = eeee[0] - eeeo[0]; |
413 | 2.23M | eee[1] = eeee[1] + eeeo[1]; |
414 | 2.23M | eee[2] = eeee[1] - eeeo[1]; |
415 | 11.1M | for(k = 0; k < 4; k++) |
416 | 8.87M | { |
417 | 8.87M | ee[k] = eee[k] + eeo[k]; |
418 | 8.87M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
419 | 8.87M | } |
420 | 19.9M | for(k = 0; k < 8; k++) |
421 | 17.7M | { |
422 | 17.7M | e[k] = ee[k] + eo[k]; |
423 | 17.7M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
424 | 17.7M | } |
425 | 37.4M | for(k = 0; k < 16; k++) |
426 | 35.2M | { |
427 | 35.2M | WORD32 itrans_out; |
428 | 35.2M | itrans_out = |
429 | 35.2M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
430 | 35.2M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
431 | 35.2M | itrans_out = |
432 | 35.2M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
433 | 35.2M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
434 | 35.2M | } |
435 | 2.23M | pi2_tmp++; |
436 | 2.23M | pu1_pred += pred_strd; |
437 | 2.23M | pu1_dst += dst_strd; |
438 | 2.23M | } |
439 | 70.0k | } |
440 | | /************************************************************************************************/ |
441 | | /************************************END - IT_RECON_32x32****************************************/ |
442 | | /************************************************************************************************/ |
443 | 310k | } |
444 | 594k | else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */ |
445 | 233k | { |
446 | | /************************************************************************************************/ |
447 | | /**********************************START - IT_RECON_32x32****************************************/ |
448 | | /************************************************************************************************/ |
449 | | /* Inverse Transform 1st stage */ |
450 | 233k | shift = IT_SHIFT_STAGE_1; |
451 | 233k | add = 1 << (shift - 1); |
452 | | |
453 | 3.28M | for(j = 0; j < row_limit_2nd_stage; j++) |
454 | 3.05M | { |
455 | | /* Checking for Zero Cols */ |
456 | 3.05M | if((zero_cols & 1) == 1) |
457 | 2.38M | { |
458 | 2.38M | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
459 | 2.38M | } |
460 | 662k | else |
461 | 662k | { |
462 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
463 | 11.2M | for(k = 0; k < 16; k++) |
464 | 10.6M | { |
465 | 10.6M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
466 | 10.6M | + g_ai2_ihevc_trans_32[3][k] |
467 | 10.6M | * pi2_src[3 * src_strd] |
468 | 10.6M | + g_ai2_ihevc_trans_32[5][k] |
469 | 10.6M | * pi2_src[5 * src_strd] |
470 | 10.6M | + g_ai2_ihevc_trans_32[7][k] |
471 | 10.6M | * pi2_src[7 * src_strd]; |
472 | 10.6M | } |
473 | 5.96M | for(k = 0; k < 8; k++) |
474 | 5.30M | { |
475 | 5.30M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
476 | 5.30M | + g_ai2_ihevc_trans_32[6][k] |
477 | 5.30M | * pi2_src[6 * src_strd]; |
478 | 5.30M | } |
479 | 3.31M | for(k = 0; k < 4; k++) |
480 | 2.65M | { |
481 | 2.65M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]; |
482 | 2.65M | } |
483 | 662k | eeeo[0] = 0; |
484 | 662k | eeeo[1] = 0; |
485 | 662k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
486 | 662k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
487 | | |
488 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
489 | 662k | eee[0] = eeee[0] + eeeo[0]; |
490 | 662k | eee[3] = eeee[0] - eeeo[0]; |
491 | 662k | eee[1] = eeee[1] + eeeo[1]; |
492 | 662k | eee[2] = eeee[1] - eeeo[1]; |
493 | 3.31M | for(k = 0; k < 4; k++) |
494 | 2.65M | { |
495 | 2.65M | ee[k] = eee[k] + eeo[k]; |
496 | 2.65M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
497 | 2.65M | } |
498 | 5.96M | for(k = 0; k < 8; k++) |
499 | 5.30M | { |
500 | 5.30M | e[k] = ee[k] + eo[k]; |
501 | 5.30M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
502 | 5.30M | } |
503 | 11.2M | for(k = 0; k < 16; k++) |
504 | 10.6M | { |
505 | 10.6M | pi2_tmp[k] = |
506 | 10.6M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
507 | 10.6M | pi2_tmp[k + 16] = |
508 | 10.6M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
509 | 10.6M | } |
510 | 662k | } |
511 | 3.05M | pi2_src++; |
512 | 3.05M | pi2_tmp += trans_size; |
513 | 3.05M | zero_cols = zero_cols >> 1; |
514 | 3.05M | } |
515 | | |
516 | 233k | pi2_tmp = pi2_tmp_orig; |
517 | | |
518 | | /* Inverse Transform 2nd stage */ |
519 | 233k | shift = IT_SHIFT_STAGE_2; |
520 | 233k | add = 1 << (shift - 1); |
521 | 233k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
522 | 70.5k | { |
523 | 2.30M | for(j = 0; j < trans_size; j++) |
524 | 2.23M | { |
525 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
526 | 37.5M | for(k = 0; k < 16; k++) |
527 | 35.3M | { |
528 | 35.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
529 | 35.3M | + g_ai2_ihevc_trans_32[3][k] |
530 | 35.3M | * pi2_tmp[3 * trans_size]; |
531 | 35.3M | } |
532 | 20.0M | for(k = 0; k < 8; k++) |
533 | 17.7M | { |
534 | 17.7M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
535 | 17.7M | } |
536 | | // for(k = 0; k < 4; k++) |
537 | 2.23M | { |
538 | 2.23M | eeo[0] = 0; |
539 | 2.23M | eeo[1] = 0; |
540 | 2.23M | eeo[2] = 0; |
541 | 2.23M | eeo[3] = 0; |
542 | 2.23M | } |
543 | 2.23M | eeeo[0] = 0; |
544 | 2.23M | eeeo[1] = 0; |
545 | 2.23M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
546 | 2.23M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
547 | | |
548 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
549 | 2.23M | eee[0] = eeee[0] + eeeo[0]; |
550 | 2.23M | eee[3] = eeee[0] - eeeo[0]; |
551 | 2.23M | eee[1] = eeee[1] + eeeo[1]; |
552 | 2.23M | eee[2] = eeee[1] - eeeo[1]; |
553 | 11.0M | for(k = 0; k < 4; k++) |
554 | 8.84M | { |
555 | 8.84M | ee[k] = eee[k] + eeo[k]; |
556 | 8.84M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
557 | 8.84M | } |
558 | 19.8M | for(k = 0; k < 8; k++) |
559 | 17.6M | { |
560 | 17.6M | e[k] = ee[k] + eo[k]; |
561 | 17.6M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
562 | 17.6M | } |
563 | 36.9M | for(k = 0; k < 16; k++) |
564 | 34.6M | { |
565 | 34.6M | WORD32 itrans_out; |
566 | 34.6M | itrans_out = |
567 | 34.6M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
568 | 34.6M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
569 | 34.6M | itrans_out = |
570 | 34.6M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
571 | 34.6M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
572 | 34.6M | } |
573 | 2.23M | pi2_tmp++; |
574 | 2.23M | pu1_pred += pred_strd; |
575 | 2.23M | pu1_dst += dst_strd; |
576 | 2.23M | } |
577 | 70.5k | } |
578 | 163k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
579 | 101k | { |
580 | 3.32M | for(j = 0; j < trans_size; j++) |
581 | 3.22M | { |
582 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
583 | 53.8M | for(k = 0; k < 16; k++) |
584 | 50.6M | { |
585 | 50.6M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
586 | 50.6M | + g_ai2_ihevc_trans_32[3][k] |
587 | 50.6M | * pi2_tmp[3 * trans_size] |
588 | 50.6M | + g_ai2_ihevc_trans_32[5][k] |
589 | 50.6M | * pi2_tmp[5 * trans_size] |
590 | 50.6M | + g_ai2_ihevc_trans_32[7][k] |
591 | 50.6M | * pi2_tmp[7 * trans_size]; |
592 | 50.6M | } |
593 | 28.8M | for(k = 0; k < 8; k++) |
594 | 25.6M | { |
595 | 25.6M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
596 | 25.6M | + g_ai2_ihevc_trans_32[6][k] |
597 | 25.6M | * pi2_tmp[6 * trans_size]; |
598 | 25.6M | } |
599 | 16.0M | for(k = 0; k < 4; k++) |
600 | 12.8M | { |
601 | 12.8M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
602 | 12.8M | } |
603 | 3.22M | eeeo[0] = 0; |
604 | 3.22M | eeeo[1] = 0; |
605 | 3.22M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
606 | 3.22M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
607 | | |
608 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
609 | 3.22M | eee[0] = eeee[0] + eeeo[0]; |
610 | 3.22M | eee[3] = eeee[0] - eeeo[0]; |
611 | 3.22M | eee[1] = eeee[1] + eeeo[1]; |
612 | 3.22M | eee[2] = eeee[1] - eeeo[1]; |
613 | 15.8M | for(k = 0; k < 4; k++) |
614 | 12.6M | { |
615 | 12.6M | ee[k] = eee[k] + eeo[k]; |
616 | 12.6M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
617 | 12.6M | } |
618 | 28.2M | for(k = 0; k < 8; k++) |
619 | 25.0M | { |
620 | 25.0M | e[k] = ee[k] + eo[k]; |
621 | 25.0M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
622 | 25.0M | } |
623 | 51.2M | for(k = 0; k < 16; k++) |
624 | 47.9M | { |
625 | 47.9M | WORD32 itrans_out; |
626 | 47.9M | itrans_out = |
627 | 47.9M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
628 | 47.9M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
629 | 47.9M | itrans_out = |
630 | 47.9M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
631 | 47.9M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
632 | 47.9M | } |
633 | 3.22M | pi2_tmp++; |
634 | 3.22M | pu1_pred += pred_strd; |
635 | 3.22M | pu1_dst += dst_strd; |
636 | 3.22M | } |
637 | 101k | } |
638 | 61.1k | else /* All rows of output of 1st stage are non-zero */ |
639 | 61.1k | { |
640 | 2.01M | for(j = 0; j < trans_size; j++) |
641 | 1.95M | { |
642 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
643 | 32.5M | for(k = 0; k < 16; k++) |
644 | 30.6M | { |
645 | 30.6M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
646 | 30.6M | + g_ai2_ihevc_trans_32[3][k] |
647 | 30.6M | * pi2_tmp[3 * trans_size] |
648 | 30.6M | + g_ai2_ihevc_trans_32[5][k] |
649 | 30.6M | * pi2_tmp[5 * trans_size] |
650 | 30.6M | + g_ai2_ihevc_trans_32[7][k] |
651 | 30.6M | * pi2_tmp[7 * trans_size] |
652 | 30.6M | + g_ai2_ihevc_trans_32[9][k] |
653 | 30.6M | * pi2_tmp[9 * trans_size] |
654 | 30.6M | + g_ai2_ihevc_trans_32[11][k] |
655 | 30.6M | * pi2_tmp[11 * trans_size] |
656 | 30.6M | + g_ai2_ihevc_trans_32[13][k] |
657 | 30.6M | * pi2_tmp[13 * trans_size] |
658 | 30.6M | + g_ai2_ihevc_trans_32[15][k] |
659 | 30.6M | * pi2_tmp[15 * trans_size] |
660 | 30.6M | + g_ai2_ihevc_trans_32[17][k] |
661 | 30.6M | * pi2_tmp[17 * trans_size] |
662 | 30.6M | + g_ai2_ihevc_trans_32[19][k] |
663 | 30.6M | * pi2_tmp[19 * trans_size] |
664 | 30.6M | + g_ai2_ihevc_trans_32[21][k] |
665 | 30.6M | * pi2_tmp[21 * trans_size] |
666 | 30.6M | + g_ai2_ihevc_trans_32[23][k] |
667 | 30.6M | * pi2_tmp[23 * trans_size] |
668 | 30.6M | + g_ai2_ihevc_trans_32[25][k] |
669 | 30.6M | * pi2_tmp[25 * trans_size] |
670 | 30.6M | + g_ai2_ihevc_trans_32[27][k] |
671 | 30.6M | * pi2_tmp[27 * trans_size] |
672 | 30.6M | + g_ai2_ihevc_trans_32[29][k] |
673 | 30.6M | * pi2_tmp[29 * trans_size] |
674 | 30.6M | + g_ai2_ihevc_trans_32[31][k] |
675 | 30.6M | * pi2_tmp[31 * trans_size]; |
676 | 30.6M | } |
677 | 17.5M | for(k = 0; k < 8; k++) |
678 | 15.5M | { |
679 | 15.5M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
680 | 15.5M | + g_ai2_ihevc_trans_32[6][k] |
681 | 15.5M | * pi2_tmp[6 * trans_size] |
682 | 15.5M | + g_ai2_ihevc_trans_32[10][k] |
683 | 15.5M | * pi2_tmp[10 * trans_size] |
684 | 15.5M | + g_ai2_ihevc_trans_32[14][k] |
685 | 15.5M | * pi2_tmp[14 * trans_size] |
686 | 15.5M | + g_ai2_ihevc_trans_32[18][k] |
687 | 15.5M | * pi2_tmp[18 * trans_size] |
688 | 15.5M | + g_ai2_ihevc_trans_32[22][k] |
689 | 15.5M | * pi2_tmp[22 * trans_size] |
690 | 15.5M | + g_ai2_ihevc_trans_32[26][k] |
691 | 15.5M | * pi2_tmp[26 * trans_size] |
692 | 15.5M | + g_ai2_ihevc_trans_32[30][k] |
693 | 15.5M | * pi2_tmp[30 * trans_size]; |
694 | 15.5M | } |
695 | 9.76M | for(k = 0; k < 4; k++) |
696 | 7.81M | { |
697 | 7.81M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
698 | 7.81M | + g_ai2_ihevc_trans_32[12][k] |
699 | 7.81M | * pi2_tmp[12 * trans_size] |
700 | 7.81M | + g_ai2_ihevc_trans_32[20][k] |
701 | 7.81M | * pi2_tmp[20 * trans_size] |
702 | 7.81M | + g_ai2_ihevc_trans_32[28][k] |
703 | 7.81M | * pi2_tmp[28 * trans_size]; |
704 | 7.81M | } |
705 | 1.95M | eeeo[0] = |
706 | 1.95M | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
707 | 1.95M | + g_ai2_ihevc_trans_32[24][0] |
708 | 1.95M | * pi2_tmp[24 |
709 | 1.95M | * trans_size]; |
710 | 1.95M | eeeo[1] = |
711 | 1.95M | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
712 | 1.95M | + g_ai2_ihevc_trans_32[24][1] |
713 | 1.95M | * pi2_tmp[24 |
714 | 1.95M | * trans_size]; |
715 | 1.95M | eeee[0] = |
716 | 1.95M | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
717 | 1.95M | + g_ai2_ihevc_trans_32[16][0] |
718 | 1.95M | * pi2_tmp[16 |
719 | 1.95M | * trans_size]; |
720 | 1.95M | eeee[1] = |
721 | 1.95M | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
722 | 1.95M | + g_ai2_ihevc_trans_32[16][1] |
723 | 1.95M | * pi2_tmp[16 |
724 | 1.95M | * trans_size]; |
725 | | |
726 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
727 | 1.95M | eee[0] = eeee[0] + eeeo[0]; |
728 | 1.95M | eee[3] = eeee[0] - eeeo[0]; |
729 | 1.95M | eee[1] = eeee[1] + eeeo[1]; |
730 | 1.95M | eee[2] = eeee[1] - eeeo[1]; |
731 | 9.74M | for(k = 0; k < 4; k++) |
732 | 7.79M | { |
733 | 7.79M | ee[k] = eee[k] + eeo[k]; |
734 | 7.79M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
735 | 7.79M | } |
736 | 17.5M | for(k = 0; k < 8; k++) |
737 | 15.5M | { |
738 | 15.5M | e[k] = ee[k] + eo[k]; |
739 | 15.5M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
740 | 15.5M | } |
741 | 32.5M | for(k = 0; k < 16; k++) |
742 | 30.5M | { |
743 | 30.5M | WORD32 itrans_out; |
744 | 30.5M | itrans_out = |
745 | 30.5M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
746 | 30.5M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
747 | 30.5M | itrans_out = |
748 | 30.5M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
749 | 30.5M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
750 | 30.5M | } |
751 | 1.95M | pi2_tmp++; |
752 | 1.95M | pu1_pred += pred_strd; |
753 | 1.95M | pu1_dst += dst_strd; |
754 | 1.95M | } |
755 | 61.1k | } |
756 | | /************************************************************************************************/ |
757 | | /************************************END - IT_RECON_32x32****************************************/ |
758 | | /************************************************************************************************/ |
759 | 233k | } |
760 | 360k | else /* All rows of input are non-zero */ |
761 | 360k | { |
762 | | /************************************************************************************************/ |
763 | | /**********************************START - IT_RECON_32x32****************************************/ |
764 | | /************************************************************************************************/ |
765 | | /* Inverse Transform 1st stage */ |
766 | 360k | shift = IT_SHIFT_STAGE_1; |
767 | 360k | add = 1 << (shift - 1); |
768 | | |
769 | 7.19M | for(j = 0; j < row_limit_2nd_stage; j++) |
770 | 6.83M | { |
771 | | /* Checking for Zero Cols */ |
772 | 6.83M | if((zero_cols & 1) == 1) |
773 | 2.96M | { |
774 | 2.96M | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
775 | 2.96M | } |
776 | 3.87M | else |
777 | 3.87M | { |
778 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
779 | 65.8M | for(k = 0; k < 16; k++) |
780 | 61.9M | { |
781 | 61.9M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
782 | 61.9M | + g_ai2_ihevc_trans_32[3][k] |
783 | 61.9M | * pi2_src[3 * src_strd] |
784 | 61.9M | + g_ai2_ihevc_trans_32[5][k] |
785 | 61.9M | * pi2_src[5 * src_strd] |
786 | 61.9M | + g_ai2_ihevc_trans_32[7][k] |
787 | 61.9M | * pi2_src[7 * src_strd] |
788 | 61.9M | + g_ai2_ihevc_trans_32[9][k] |
789 | 61.9M | * pi2_src[9 * src_strd] |
790 | 61.9M | + g_ai2_ihevc_trans_32[11][k] |
791 | 61.9M | * pi2_src[11 * src_strd] |
792 | 61.9M | + g_ai2_ihevc_trans_32[13][k] |
793 | 61.9M | * pi2_src[13 * src_strd] |
794 | 61.9M | + g_ai2_ihevc_trans_32[15][k] |
795 | 61.9M | * pi2_src[15 * src_strd] |
796 | 61.9M | + g_ai2_ihevc_trans_32[17][k] |
797 | 61.9M | * pi2_src[17 * src_strd] |
798 | 61.9M | + g_ai2_ihevc_trans_32[19][k] |
799 | 61.9M | * pi2_src[19 * src_strd] |
800 | 61.9M | + g_ai2_ihevc_trans_32[21][k] |
801 | 61.9M | * pi2_src[21 * src_strd] |
802 | 61.9M | + g_ai2_ihevc_trans_32[23][k] |
803 | 61.9M | * pi2_src[23 * src_strd] |
804 | 61.9M | + g_ai2_ihevc_trans_32[25][k] |
805 | 61.9M | * pi2_src[25 * src_strd] |
806 | 61.9M | + g_ai2_ihevc_trans_32[27][k] |
807 | 61.9M | * pi2_src[27 * src_strd] |
808 | 61.9M | + g_ai2_ihevc_trans_32[29][k] |
809 | 61.9M | * pi2_src[29 * src_strd] |
810 | 61.9M | + g_ai2_ihevc_trans_32[31][k] |
811 | 61.9M | * pi2_src[31 * src_strd]; |
812 | 61.9M | } |
813 | 34.8M | for(k = 0; k < 8; k++) |
814 | 31.0M | { |
815 | 31.0M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
816 | 31.0M | + g_ai2_ihevc_trans_32[6][k] |
817 | 31.0M | * pi2_src[6 * src_strd] |
818 | 31.0M | + g_ai2_ihevc_trans_32[10][k] |
819 | 31.0M | * pi2_src[10 * src_strd] |
820 | 31.0M | + g_ai2_ihevc_trans_32[14][k] |
821 | 31.0M | * pi2_src[14 * src_strd] |
822 | 31.0M | + g_ai2_ihevc_trans_32[18][k] |
823 | 31.0M | * pi2_src[18 * src_strd] |
824 | 31.0M | + g_ai2_ihevc_trans_32[22][k] |
825 | 31.0M | * pi2_src[22 * src_strd] |
826 | 31.0M | + g_ai2_ihevc_trans_32[26][k] |
827 | 31.0M | * pi2_src[26 * src_strd] |
828 | 31.0M | + g_ai2_ihevc_trans_32[30][k] |
829 | 31.0M | * pi2_src[30 * src_strd]; |
830 | 31.0M | } |
831 | 19.3M | for(k = 0; k < 4; k++) |
832 | 15.5M | { |
833 | 15.5M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd] |
834 | 15.5M | + g_ai2_ihevc_trans_32[12][k] |
835 | 15.5M | * pi2_src[12 * src_strd] |
836 | 15.5M | + g_ai2_ihevc_trans_32[20][k] |
837 | 15.5M | * pi2_src[20 * src_strd] |
838 | 15.5M | + g_ai2_ihevc_trans_32[28][k] |
839 | 15.5M | * pi2_src[28 * src_strd]; |
840 | 15.5M | } |
841 | 3.87M | eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd] |
842 | 3.87M | + g_ai2_ihevc_trans_32[24][0] |
843 | 3.87M | * pi2_src[24 * src_strd]; |
844 | 3.87M | eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd] |
845 | 3.87M | + g_ai2_ihevc_trans_32[24][1] |
846 | 3.87M | * pi2_src[24 * src_strd]; |
847 | 3.87M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0] |
848 | 3.87M | + g_ai2_ihevc_trans_32[16][0] |
849 | 3.87M | * pi2_src[16 * src_strd]; |
850 | 3.87M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0] |
851 | 3.87M | + g_ai2_ihevc_trans_32[16][1] |
852 | 3.87M | * pi2_src[16 * src_strd]; |
853 | | |
854 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
855 | 3.87M | eee[0] = eeee[0] + eeeo[0]; |
856 | 3.87M | eee[3] = eeee[0] - eeeo[0]; |
857 | 3.87M | eee[1] = eeee[1] + eeeo[1]; |
858 | 3.87M | eee[2] = eeee[1] - eeeo[1]; |
859 | 19.3M | for(k = 0; k < 4; k++) |
860 | 15.5M | { |
861 | 15.5M | ee[k] = eee[k] + eeo[k]; |
862 | 15.5M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
863 | 15.5M | } |
864 | 34.8M | for(k = 0; k < 8; k++) |
865 | 31.0M | { |
866 | 31.0M | e[k] = ee[k] + eo[k]; |
867 | 31.0M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
868 | 31.0M | } |
869 | 65.8M | for(k = 0; k < 16; k++) |
870 | 61.9M | { |
871 | 61.9M | pi2_tmp[k] = |
872 | 61.9M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
873 | 61.9M | pi2_tmp[k + 16] = |
874 | 61.9M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
875 | 61.9M | } |
876 | 3.87M | } |
877 | 6.83M | pi2_src++; |
878 | 6.83M | pi2_tmp += trans_size; |
879 | 6.83M | zero_cols = zero_cols >> 1; |
880 | 6.83M | } |
881 | | |
882 | 360k | pi2_tmp = pi2_tmp_orig; |
883 | | |
884 | | /* Inverse Transform 2nd stage */ |
885 | 360k | shift = IT_SHIFT_STAGE_2; |
886 | 360k | add = 1 << (shift - 1); |
887 | 360k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
888 | 86.4k | { |
889 | 2.77M | for(j = 0; j < trans_size; j++) |
890 | 2.69M | { |
891 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
892 | 44.2M | for(k = 0; k < 16; k++) |
893 | 41.5M | { |
894 | 41.5M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
895 | 41.5M | + g_ai2_ihevc_trans_32[3][k] |
896 | 41.5M | * pi2_tmp[3 * trans_size]; |
897 | 41.5M | } |
898 | 23.8M | for(k = 0; k < 8; k++) |
899 | 21.1M | { |
900 | 21.1M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
901 | 21.1M | } |
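902 | | /* eeo[] is built from rows 4, 12, 20 and 28 of pi2_tmp, which this branch does not read; the k = 0..3 loop is unrolled to plain zero stores */ |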
903 | 2.69M | { |
904 | 2.69M | eeo[0] = 0; |
905 | 2.69M | eeo[1] = 0; |
906 | 2.69M | eeo[2] = 0; |
907 | 2.69M | eeo[3] = 0; |
908 | 2.69M | } |
909 | 2.69M | eeeo[0] = 0; |
910 | 2.69M | eeeo[1] = 0; |
911 | 2.69M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
912 | 2.69M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
913 | | |
914 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
915 | 2.69M | eee[0] = eeee[0] + eeeo[0]; |
916 | 2.69M | eee[3] = eeee[0] - eeeo[0]; |
917 | 2.69M | eee[1] = eeee[1] + eeeo[1]; |
918 | 2.69M | eee[2] = eeee[1] - eeeo[1]; |
919 | 13.1M | for(k = 0; k < 4; k++) |
920 | 10.4M | { |
921 | 10.4M | ee[k] = eee[k] + eeo[k]; |
922 | 10.4M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
923 | 10.4M | } |
924 | 23.4M | for(k = 0; k < 8; k++) |
925 | 20.8M | { |
926 | 20.8M | e[k] = ee[k] + eo[k]; |
927 | 20.8M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
928 | 20.8M | } |
929 | 42.3M | for(k = 0; k < 16; k++) |
930 | 39.7M | { |
931 | 39.7M | WORD32 itrans_out; |
932 | 39.7M | itrans_out = |
933 | 39.7M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
934 | 39.7M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
935 | 39.7M | itrans_out = |
936 | 39.7M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
937 | 39.7M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
938 | 39.7M | } |
939 | 2.69M | pi2_tmp++; |
940 | 2.69M | pu1_pred += pred_strd; |
941 | 2.69M | pu1_dst += dst_strd; |
942 | 2.69M | } |
943 | 86.4k | } |
944 | 274k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
945 | 95.2k | { |
946 | 3.10M | for(j = 0; j < trans_size; j++) |
947 | 3.01M | { |
948 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
949 | 49.3M | for(k = 0; k < 16; k++) |
950 | 46.3M | { |
951 | 46.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
952 | 46.3M | + g_ai2_ihevc_trans_32[3][k] |
953 | 46.3M | * pi2_tmp[3 * trans_size] |
954 | 46.3M | + g_ai2_ihevc_trans_32[5][k] |
955 | 46.3M | * pi2_tmp[5 * trans_size] |
956 | 46.3M | + g_ai2_ihevc_trans_32[7][k] |
957 | 46.3M | * pi2_tmp[7 * trans_size]; |
958 | 46.3M | } |
959 | 26.9M | for(k = 0; k < 8; k++) |
960 | 23.9M | { |
961 | 23.9M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
962 | 23.9M | + g_ai2_ihevc_trans_32[6][k] |
963 | 23.9M | * pi2_tmp[6 * trans_size]; |
964 | 23.9M | } |
965 | 15.0M | for(k = 0; k < 4; k++) |
966 | 12.0M | { |
967 | 12.0M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
968 | 12.0M | } |
969 | 3.01M | eeeo[0] = 0; |
970 | 3.01M | eeeo[1] = 0; |
971 | 3.01M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
972 | 3.01M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
973 | | |
974 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
975 | 3.01M | eee[0] = eeee[0] + eeeo[0]; |
976 | 3.01M | eee[3] = eeee[0] - eeeo[0]; |
977 | 3.01M | eee[1] = eeee[1] + eeeo[1]; |
978 | 3.01M | eee[2] = eeee[1] - eeeo[1]; |
979 | 14.9M | for(k = 0; k < 4; k++) |
980 | 11.9M | { |
981 | 11.9M | ee[k] = eee[k] + eeo[k]; |
982 | 11.9M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
983 | 11.9M | } |
984 | 26.7M | for(k = 0; k < 8; k++) |
985 | 23.7M | { |
986 | 23.7M | e[k] = ee[k] + eo[k]; |
987 | 23.7M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
988 | 23.7M | } |
989 | 49.0M | for(k = 0; k < 16; k++) |
990 | 46.0M | { |
991 | 46.0M | WORD32 itrans_out; |
992 | 46.0M | itrans_out = |
993 | 46.0M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
994 | 46.0M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
995 | 46.0M | itrans_out = |
996 | 46.0M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
997 | 46.0M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
998 | 46.0M | } |
999 | 3.01M | pi2_tmp++; |
1000 | 3.01M | pu1_pred += pred_strd; |
1001 | 3.01M | pu1_dst += dst_strd; |
1002 | 3.01M | } |
1003 | 95.2k | } |
1004 | 178k | else /* All rows of output of 1st stage are non-zero */ |
1005 | 178k | { |
1006 | 5.88M | for(j = 0; j < trans_size; j++) |
1007 | 5.70M | { |
1008 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
1009 | 94.0M | for(k = 0; k < 16; k++) |
1010 | 88.3M | { |
1011 | 88.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
1012 | 88.3M | + g_ai2_ihevc_trans_32[3][k] |
1013 | 88.3M | * pi2_tmp[3 * trans_size] |
1014 | 88.3M | + g_ai2_ihevc_trans_32[5][k] |
1015 | 88.3M | * pi2_tmp[5 * trans_size] |
1016 | 88.3M | + g_ai2_ihevc_trans_32[7][k] |
1017 | 88.3M | * pi2_tmp[7 * trans_size] |
1018 | 88.3M | + g_ai2_ihevc_trans_32[9][k] |
1019 | 88.3M | * pi2_tmp[9 * trans_size] |
1020 | 88.3M | + g_ai2_ihevc_trans_32[11][k] |
1021 | 88.3M | * pi2_tmp[11 * trans_size] |
1022 | 88.3M | + g_ai2_ihevc_trans_32[13][k] |
1023 | 88.3M | * pi2_tmp[13 * trans_size] |
1024 | 88.3M | + g_ai2_ihevc_trans_32[15][k] |
1025 | 88.3M | * pi2_tmp[15 * trans_size] |
1026 | 88.3M | + g_ai2_ihevc_trans_32[17][k] |
1027 | 88.3M | * pi2_tmp[17 * trans_size] |
1028 | 88.3M | + g_ai2_ihevc_trans_32[19][k] |
1029 | 88.3M | * pi2_tmp[19 * trans_size] |
1030 | 88.3M | + g_ai2_ihevc_trans_32[21][k] |
1031 | 88.3M | * pi2_tmp[21 * trans_size] |
1032 | 88.3M | + g_ai2_ihevc_trans_32[23][k] |
1033 | 88.3M | * pi2_tmp[23 * trans_size] |
1034 | 88.3M | + g_ai2_ihevc_trans_32[25][k] |
1035 | 88.3M | * pi2_tmp[25 * trans_size] |
1036 | 88.3M | + g_ai2_ihevc_trans_32[27][k] |
1037 | 88.3M | * pi2_tmp[27 * trans_size] |
1038 | 88.3M | + g_ai2_ihevc_trans_32[29][k] |
1039 | 88.3M | * pi2_tmp[29 * trans_size] |
1040 | 88.3M | + g_ai2_ihevc_trans_32[31][k] |
1041 | 88.3M | * pi2_tmp[31 * trans_size]; |
1042 | 88.3M | } |
1043 | 50.7M | for(k = 0; k < 8; k++) |
1044 | 45.0M | { |
1045 | 45.0M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
1046 | 45.0M | + g_ai2_ihevc_trans_32[6][k] |
1047 | 45.0M | * pi2_tmp[6 * trans_size] |
1048 | 45.0M | + g_ai2_ihevc_trans_32[10][k] |
1049 | 45.0M | * pi2_tmp[10 * trans_size] |
1050 | 45.0M | + g_ai2_ihevc_trans_32[14][k] |
1051 | 45.0M | * pi2_tmp[14 * trans_size] |
1052 | 45.0M | + g_ai2_ihevc_trans_32[18][k] |
1053 | 45.0M | * pi2_tmp[18 * trans_size] |
1054 | 45.0M | + g_ai2_ihevc_trans_32[22][k] |
1055 | 45.0M | * pi2_tmp[22 * trans_size] |
1056 | 45.0M | + g_ai2_ihevc_trans_32[26][k] |
1057 | 45.0M | * pi2_tmp[26 * trans_size] |
1058 | 45.0M | + g_ai2_ihevc_trans_32[30][k] |
1059 | 45.0M | * pi2_tmp[30 * trans_size]; |
1060 | 45.0M | } |
1061 | 28.4M | for(k = 0; k < 4; k++) |
1062 | 22.7M | { |
1063 | 22.7M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
1064 | 22.7M | + g_ai2_ihevc_trans_32[12][k] |
1065 | 22.7M | * pi2_tmp[12 * trans_size] |
1066 | 22.7M | + g_ai2_ihevc_trans_32[20][k] |
1067 | 22.7M | * pi2_tmp[20 * trans_size] |
1068 | 22.7M | + g_ai2_ihevc_trans_32[28][k] |
1069 | 22.7M | * pi2_tmp[28 * trans_size]; |
1070 | 22.7M | } |
1071 | 5.70M | eeeo[0] = |
1072 | 5.70M | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
1073 | 5.70M | + g_ai2_ihevc_trans_32[24][0] |
1074 | 5.70M | * pi2_tmp[24 |
1075 | 5.70M | * trans_size]; |
1076 | 5.70M | eeeo[1] = |
1077 | 5.70M | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
1078 | 5.70M | + g_ai2_ihevc_trans_32[24][1] |
1079 | 5.70M | * pi2_tmp[24 |
1080 | 5.70M | * trans_size]; |
1081 | 5.70M | eeee[0] = |
1082 | 5.70M | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
1083 | 5.70M | + g_ai2_ihevc_trans_32[16][0] |
1084 | 5.70M | * pi2_tmp[16 |
1085 | 5.70M | * trans_size]; |
1086 | 5.70M | eeee[1] = |
1087 | 5.70M | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
1088 | 5.70M | + g_ai2_ihevc_trans_32[16][1] |
1089 | 5.70M | * pi2_tmp[16 |
1090 | 5.70M | * trans_size]; |
1091 | | |
1092 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
1093 | 5.70M | eee[0] = eeee[0] + eeeo[0]; |
1094 | 5.70M | eee[3] = eeee[0] - eeeo[0]; |
1095 | 5.70M | eee[1] = eeee[1] + eeeo[1]; |
1096 | 5.70M | eee[2] = eeee[1] - eeeo[1]; |
1097 | 28.4M | for(k = 0; k < 4; k++) |
1098 | 22.7M | { |
1099 | 22.7M | ee[k] = eee[k] + eeo[k]; |
1100 | 22.7M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
1101 | 22.7M | } |
1102 | 51.1M | for(k = 0; k < 8; k++) |
1103 | 45.4M | { |
1104 | 45.4M | e[k] = ee[k] + eo[k]; |
1105 | 45.4M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
1106 | 45.4M | } |
1107 | 95.4M | for(k = 0; k < 16; k++) |
1108 | 89.7M | { |
1109 | 89.7M | WORD32 itrans_out; |
1110 | 89.7M | itrans_out = |
1111 | 89.7M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
1112 | 89.7M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
1113 | 89.7M | itrans_out = |
1114 | 89.7M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
1115 | 89.7M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
1116 | 89.7M | } |
1117 | 5.70M | pi2_tmp++; |
1118 | 5.70M | pu1_pred += pred_strd; |
1119 | 5.70M | pu1_dst += dst_strd; |
1120 | 5.70M | } |
1121 | 178k | } |
1122 | | /************************************************************************************************/ |
1123 | | /************************************END - IT_RECON_32x32****************************************/ |
1124 | | /************************************************************************************************/ |
1125 | 360k | } |
1126 | 904k | } |
1127 | | |