/src/libhevc/common/ihevc_itrans_recon_32x32.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ******************************************************************************/ |
18 | | /** |
19 | | ******************************************************************************* |
20 | | * @file |
21 | | * ihevc_itrans_recon_32x32.c |
22 | | * |
23 | | * @brief |
24 | | * Contains function definitions for inverse transform and reconstruction 32x32 |
25 | | * |
26 | | * |
27 | | * @author |
28 | | * 100470 |
29 | | * |
30 | | * @par List of Functions: |
31 | | * - ihevc_itrans_recon_32x32() |
32 | | * |
33 | | * @remarks |
34 | | * None |
35 | | * |
36 | | ******************************************************************************* |
37 | | */ |
38 | | #include <stdio.h> |
39 | | #include <string.h> |
40 | | #include "ihevc_typedefs.h" |
41 | | #include "ihevc_macros.h" |
42 | | #include "ihevc_platform_macros.h" |
43 | | #include "ihevc_defs.h" |
44 | | #include "ihevc_trans_tables.h" |
45 | | #include "ihevc_itrans_recon.h" |
46 | | #include "ihevc_func_selector.h" |
47 | | #include "ihevc_trans_macros.h" |
48 | | |
49 | | |
50 | | /** |
51 | | ******************************************************************************* |
52 | | * |
53 | | * @brief |
54 | | * This function performs Inverse transform and reconstruction for 32x32 |
55 | | * input block |
56 | | * |
57 | | * @par Description: |
58 | | * Performs inverse transform and adds the prediction data and clips output |
59 | | * to 8 bit |
60 | | * |
61 | | * @param[in] pi2_src |
62 | | * Input 32x32 coefficients |
63 | | * |
64 | | * @param[in] pi2_tmp |
65 | | * Temporary 32x32 buffer that holds the output of the |
66 | | * 1st stage of the inverse transform; it is written by |
67 | | * the 1st stage and read back as the input of the |
68 | | * 2nd stage |
69 | | * |
70 | | * @param[in] pu1_pred |
71 | | * Prediction 32x32 block |
72 | | * |
73 | | * @param[out] pu1_dst |
74 | | * Output 32x32 block |
75 | | * |
76 | | * @param[in] src_strd |
77 | | * Input stride |
78 | | * |
79 | | * @param[in] pred_strd |
80 | | * Prediction stride |
81 | | * |
82 | | * @param[in] dst_strd |
83 | | * Output Stride |
84 | | * |
85 | | * @param[in] zero_cols |
86 | | * Bit mask of zero columns in pi2_src (bit j set => column j is zero) |
87 | | * |
88 | | * @param[in] zero_rows |
89 | | * Bit mask of zero rows in pi2_src (bit i set => row i is zero) |
90 | | * |
91 | | * @returns Void |
92 | | * |
93 | | * @remarks |
94 | | * None |
95 | | * |
96 | | ******************************************************************************* |
97 | | */ |
98 | | |
99 | | void ihevc_itrans_recon_32x32(WORD16 *pi2_src, |
100 | | WORD16 *pi2_tmp, |
101 | | UWORD8 *pu1_pred, |
102 | | UWORD8 *pu1_dst, |
103 | | WORD32 src_strd, |
104 | | WORD32 pred_strd, |
105 | | WORD32 dst_strd, |
106 | | WORD32 zero_cols, |
107 | | WORD32 zero_rows) |
108 | 340k | { |
109 | 340k | WORD32 j, k; |
110 | 340k | WORD32 e[16], o[16]; |
111 | 340k | WORD32 ee[8], eo[8]; |
112 | 340k | WORD32 eee[4], eeo[4]; |
113 | 340k | WORD32 eeee[2], eeeo[2]; |
114 | 340k | WORD32 add; |
115 | 340k | WORD32 shift; |
116 | 340k | WORD16 *pi2_tmp_orig; |
117 | 340k | WORD32 trans_size; |
118 | 340k | WORD32 zero_rows_2nd_stage = zero_cols; |
119 | 340k | WORD32 row_limit_2nd_stage; |
120 | | |
121 | 340k | trans_size = TRANS_SIZE_32; |
122 | 340k | pi2_tmp_orig = pi2_tmp; |
123 | | |
124 | 340k | if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) |
125 | 81.6k | row_limit_2nd_stage = 4; |
126 | 258k | else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) |
127 | 47.7k | row_limit_2nd_stage = 8; |
128 | 211k | else |
129 | 211k | row_limit_2nd_stage = TRANS_SIZE_32; |
130 | | |
131 | 340k | if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */ |
132 | 68.5k | { |
133 | | /************************************************************************************************/ |
134 | | /**********************************START - IT_RECON_32x32****************************************/ |
135 | | /************************************************************************************************/ |
136 | | /* Inverse Transform 1st stage */ |
137 | 68.5k | shift = IT_SHIFT_STAGE_1; |
138 | 68.5k | add = 1 << (shift - 1); |
139 | | |
140 | 1.05M | for(j = 0; j < row_limit_2nd_stage; j++) |
141 | 984k | { |
142 | | /* Checking for Zero Cols */ |
143 | 984k | if((zero_cols & 1) == 1) |
144 | 513k | { |
145 | 513k | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
146 | 513k | } |
147 | 470k | else |
148 | 470k | { |
149 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
150 | 7.99M | for(k = 0; k < 16; k++) |
151 | 7.52M | { |
152 | 7.52M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
153 | 7.52M | + g_ai2_ihevc_trans_32[3][k] |
154 | 7.52M | * pi2_src[3 * src_strd]; |
155 | 7.52M | } |
156 | 4.23M | for(k = 0; k < 8; k++) |
157 | 3.76M | { |
158 | 3.76M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]; |
159 | 3.76M | } |
160 | | // for(k = 0; k < 4; k++) |
161 | 470k | { |
162 | 470k | eeo[0] = 0; |
163 | 470k | eeo[1] = 0; |
164 | 470k | eeo[2] = 0; |
165 | 470k | eeo[3] = 0; |
166 | 470k | } |
167 | 470k | eeeo[0] = 0; |
168 | 470k | eeeo[1] = 0; |
169 | 470k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
170 | 470k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
171 | | |
172 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
173 | 470k | eee[0] = eeee[0] + eeeo[0]; |
174 | 470k | eee[3] = eeee[0] - eeeo[0]; |
175 | 470k | eee[1] = eeee[1] + eeeo[1]; |
176 | 470k | eee[2] = eeee[1] - eeeo[1]; |
177 | 2.35M | for(k = 0; k < 4; k++) |
178 | 1.88M | { |
179 | 1.88M | ee[k] = eee[k] + eeo[k]; |
180 | 1.88M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
181 | 1.88M | } |
182 | 4.23M | for(k = 0; k < 8; k++) |
183 | 3.76M | { |
184 | 3.76M | e[k] = ee[k] + eo[k]; |
185 | 3.76M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
186 | 3.76M | } |
187 | 7.99M | for(k = 0; k < 16; k++) |
188 | 7.52M | { |
189 | 7.52M | pi2_tmp[k] = |
190 | 7.52M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
191 | 7.52M | pi2_tmp[k + 16] = |
192 | 7.52M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
193 | 7.52M | } |
194 | 470k | } |
195 | 984k | pi2_src++; |
196 | 984k | pi2_tmp += trans_size; |
197 | 984k | zero_cols = zero_cols >> 1; |
198 | 984k | } |
199 | | |
200 | 68.5k | pi2_tmp = pi2_tmp_orig; |
201 | | |
202 | | /* Inverse Transform 2nd stage */ |
203 | 68.5k | shift = IT_SHIFT_STAGE_2; |
204 | 68.5k | add = 1 << (shift - 1); |
205 | 68.5k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
206 | 32.0k | { |
207 | 1.05M | for(j = 0; j < trans_size; j++) |
208 | 1.02M | { |
209 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
210 | 17.4M | for(k = 0; k < 16; k++) |
211 | 16.3M | { |
212 | 16.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
213 | 16.3M | + g_ai2_ihevc_trans_32[3][k] |
214 | 16.3M | * pi2_tmp[3 * trans_size]; |
215 | 16.3M | } |
216 | 9.21M | for(k = 0; k < 8; k++) |
217 | 8.19M | { |
218 | 8.19M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
219 | 8.19M | } |
220 | | // for(k = 0; k < 4; k++) |
221 | 1.02M | { |
222 | 1.02M | eeo[0] = 0; |
223 | 1.02M | eeo[1] = 0; |
224 | 1.02M | eeo[2] = 0; |
225 | 1.02M | eeo[3] = 0; |
226 | 1.02M | } |
227 | 1.02M | eeeo[0] = 0; |
228 | 1.02M | eeeo[1] = 0; |
229 | 1.02M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
230 | 1.02M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
231 | | |
232 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
233 | 1.02M | eee[0] = eeee[0] + eeeo[0]; |
234 | 1.02M | eee[3] = eeee[0] - eeeo[0]; |
235 | 1.02M | eee[1] = eeee[1] + eeeo[1]; |
236 | 1.02M | eee[2] = eeee[1] - eeeo[1]; |
237 | 5.10M | for(k = 0; k < 4; k++) |
238 | 4.07M | { |
239 | 4.07M | ee[k] = eee[k] + eeo[k]; |
240 | 4.07M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
241 | 4.07M | } |
242 | 9.18M | for(k = 0; k < 8; k++) |
243 | 8.16M | { |
244 | 8.16M | e[k] = ee[k] + eo[k]; |
245 | 8.16M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
246 | 8.16M | } |
247 | 17.2M | for(k = 0; k < 16; k++) |
248 | 16.2M | { |
249 | 16.2M | WORD32 itrans_out; |
250 | 16.2M | itrans_out = |
251 | 16.2M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
252 | 16.2M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
253 | 16.2M | itrans_out = |
254 | 16.2M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
255 | 16.2M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
256 | 16.2M | } |
257 | 1.02M | pi2_tmp++; |
258 | 1.02M | pu1_pred += pred_strd; |
259 | 1.02M | pu1_dst += dst_strd; |
260 | 1.02M | } |
261 | 32.0k | } |
262 | 36.5k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
263 | 12.9k | { |
264 | 428k | for(j = 0; j < trans_size; j++) |
265 | 415k | { |
266 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
267 | 7.06M | for(k = 0; k < 16; k++) |
268 | 6.65M | { |
269 | 6.65M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
270 | 6.65M | + g_ai2_ihevc_trans_32[3][k] |
271 | 6.65M | * pi2_tmp[3 * trans_size] |
272 | 6.65M | + g_ai2_ihevc_trans_32[5][k] |
273 | 6.65M | * pi2_tmp[5 * trans_size] |
274 | 6.65M | + g_ai2_ihevc_trans_32[7][k] |
275 | 6.65M | * pi2_tmp[7 * trans_size]; |
276 | 6.65M | } |
277 | 3.74M | for(k = 0; k < 8; k++) |
278 | 3.32M | { |
279 | 3.32M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
280 | 3.32M | + g_ai2_ihevc_trans_32[6][k] |
281 | 3.32M | * pi2_tmp[6 * trans_size]; |
282 | 3.32M | } |
283 | 2.07M | for(k = 0; k < 4; k++) |
284 | 1.66M | { |
285 | 1.66M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
286 | 1.66M | } |
287 | 415k | eeeo[0] = 0; |
288 | 415k | eeeo[1] = 0; |
289 | 415k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
290 | 415k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
291 | | |
292 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
293 | 415k | eee[0] = eeee[0] + eeeo[0]; |
294 | 415k | eee[3] = eeee[0] - eeeo[0]; |
295 | 415k | eee[1] = eeee[1] + eeeo[1]; |
296 | 415k | eee[2] = eeee[1] - eeeo[1]; |
297 | 2.07M | for(k = 0; k < 4; k++) |
298 | 1.66M | { |
299 | 1.66M | ee[k] = eee[k] + eeo[k]; |
300 | 1.66M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
301 | 1.66M | } |
302 | 3.74M | for(k = 0; k < 8; k++) |
303 | 3.32M | { |
304 | 3.32M | e[k] = ee[k] + eo[k]; |
305 | 3.32M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
306 | 3.32M | } |
307 | 7.06M | for(k = 0; k < 16; k++) |
308 | 6.64M | { |
309 | 6.64M | WORD32 itrans_out; |
310 | 6.64M | itrans_out = |
311 | 6.64M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
312 | 6.64M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
313 | 6.64M | itrans_out = |
314 | 6.64M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
315 | 6.64M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
316 | 6.64M | } |
317 | 415k | pi2_tmp++; |
318 | 415k | pu1_pred += pred_strd; |
319 | 415k | pu1_dst += dst_strd; |
320 | 415k | } |
321 | 12.9k | } |
322 | 23.5k | else /* All rows of output of 1st stage are non-zero */ |
323 | 23.5k | { |
324 | 775k | for(j = 0; j < trans_size; j++) |
325 | 752k | { |
326 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
327 | 12.5M | for(k = 0; k < 16; k++) |
328 | 11.8M | { |
329 | 11.8M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
330 | 11.8M | + g_ai2_ihevc_trans_32[3][k] |
331 | 11.8M | * pi2_tmp[3 * trans_size] |
332 | 11.8M | + g_ai2_ihevc_trans_32[5][k] |
333 | 11.8M | * pi2_tmp[5 * trans_size] |
334 | 11.8M | + g_ai2_ihevc_trans_32[7][k] |
335 | 11.8M | * pi2_tmp[7 * trans_size] |
336 | 11.8M | + g_ai2_ihevc_trans_32[9][k] |
337 | 11.8M | * pi2_tmp[9 * trans_size] |
338 | 11.8M | + g_ai2_ihevc_trans_32[11][k] |
339 | 11.8M | * pi2_tmp[11 * trans_size] |
340 | 11.8M | + g_ai2_ihevc_trans_32[13][k] |
341 | 11.8M | * pi2_tmp[13 * trans_size] |
342 | 11.8M | + g_ai2_ihevc_trans_32[15][k] |
343 | 11.8M | * pi2_tmp[15 * trans_size] |
344 | 11.8M | + g_ai2_ihevc_trans_32[17][k] |
345 | 11.8M | * pi2_tmp[17 * trans_size] |
346 | 11.8M | + g_ai2_ihevc_trans_32[19][k] |
347 | 11.8M | * pi2_tmp[19 * trans_size] |
348 | 11.8M | + g_ai2_ihevc_trans_32[21][k] |
349 | 11.8M | * pi2_tmp[21 * trans_size] |
350 | 11.8M | + g_ai2_ihevc_trans_32[23][k] |
351 | 11.8M | * pi2_tmp[23 * trans_size] |
352 | 11.8M | + g_ai2_ihevc_trans_32[25][k] |
353 | 11.8M | * pi2_tmp[25 * trans_size] |
354 | 11.8M | + g_ai2_ihevc_trans_32[27][k] |
355 | 11.8M | * pi2_tmp[27 * trans_size] |
356 | 11.8M | + g_ai2_ihevc_trans_32[29][k] |
357 | 11.8M | * pi2_tmp[29 * trans_size] |
358 | 11.8M | + g_ai2_ihevc_trans_32[31][k] |
359 | 11.8M | * pi2_tmp[31 * trans_size]; |
360 | 11.8M | } |
361 | 6.71M | for(k = 0; k < 8; k++) |
362 | 5.96M | { |
363 | 5.96M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
364 | 5.96M | + g_ai2_ihevc_trans_32[6][k] |
365 | 5.96M | * pi2_tmp[6 * trans_size] |
366 | 5.96M | + g_ai2_ihevc_trans_32[10][k] |
367 | 5.96M | * pi2_tmp[10 * trans_size] |
368 | 5.96M | + g_ai2_ihevc_trans_32[14][k] |
369 | 5.96M | * pi2_tmp[14 * trans_size] |
370 | 5.96M | + g_ai2_ihevc_trans_32[18][k] |
371 | 5.96M | * pi2_tmp[18 * trans_size] |
372 | 5.96M | + g_ai2_ihevc_trans_32[22][k] |
373 | 5.96M | * pi2_tmp[22 * trans_size] |
374 | 5.96M | + g_ai2_ihevc_trans_32[26][k] |
375 | 5.96M | * pi2_tmp[26 * trans_size] |
376 | 5.96M | + g_ai2_ihevc_trans_32[30][k] |
377 | 5.96M | * pi2_tmp[30 * trans_size]; |
378 | 5.96M | } |
379 | 3.75M | for(k = 0; k < 4; k++) |
380 | 2.99M | { |
381 | 2.99M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
382 | 2.99M | + g_ai2_ihevc_trans_32[12][k] |
383 | 2.99M | * pi2_tmp[12 * trans_size] |
384 | 2.99M | + g_ai2_ihevc_trans_32[20][k] |
385 | 2.99M | * pi2_tmp[20 * trans_size] |
386 | 2.99M | + g_ai2_ihevc_trans_32[28][k] |
387 | 2.99M | * pi2_tmp[28 * trans_size]; |
388 | 2.99M | } |
389 | 752k | eeeo[0] = |
390 | 752k | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
391 | 752k | + g_ai2_ihevc_trans_32[24][0] |
392 | 752k | * pi2_tmp[24 |
393 | 752k | * trans_size]; |
394 | 752k | eeeo[1] = |
395 | 752k | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
396 | 752k | + g_ai2_ihevc_trans_32[24][1] |
397 | 752k | * pi2_tmp[24 |
398 | 752k | * trans_size]; |
399 | 752k | eeee[0] = |
400 | 752k | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
401 | 752k | + g_ai2_ihevc_trans_32[16][0] |
402 | 752k | * pi2_tmp[16 |
403 | 752k | * trans_size]; |
404 | 752k | eeee[1] = |
405 | 752k | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
406 | 752k | + g_ai2_ihevc_trans_32[16][1] |
407 | 752k | * pi2_tmp[16 |
408 | 752k | * trans_size]; |
409 | | |
410 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
411 | 752k | eee[0] = eeee[0] + eeeo[0]; |
412 | 752k | eee[3] = eeee[0] - eeeo[0]; |
413 | 752k | eee[1] = eeee[1] + eeeo[1]; |
414 | 752k | eee[2] = eeee[1] - eeeo[1]; |
415 | 3.74M | for(k = 0; k < 4; k++) |
416 | 2.99M | { |
417 | 2.99M | ee[k] = eee[k] + eeo[k]; |
418 | 2.99M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
419 | 2.99M | } |
420 | 6.74M | for(k = 0; k < 8; k++) |
421 | 5.99M | { |
422 | 5.99M | e[k] = ee[k] + eo[k]; |
423 | 5.99M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
424 | 5.99M | } |
425 | 12.6M | for(k = 0; k < 16; k++) |
426 | 11.9M | { |
427 | 11.9M | WORD32 itrans_out; |
428 | 11.9M | itrans_out = |
429 | 11.9M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
430 | 11.9M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
431 | 11.9M | itrans_out = |
432 | 11.9M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
433 | 11.9M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
434 | 11.9M | } |
435 | 752k | pi2_tmp++; |
436 | 752k | pu1_pred += pred_strd; |
437 | 752k | pu1_dst += dst_strd; |
438 | 752k | } |
439 | 23.5k | } |
440 | | /************************************************************************************************/ |
441 | | /************************************END - IT_RECON_32x32****************************************/ |
442 | | /************************************************************************************************/ |
443 | 68.5k | } |
444 | 272k | else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */ |
445 | 49.5k | { |
446 | | /************************************************************************************************/ |
447 | | /**********************************START - IT_RECON_32x32****************************************/ |
448 | | /************************************************************************************************/ |
449 | | /* Inverse Transform 1st stage */ |
450 | 49.5k | shift = IT_SHIFT_STAGE_1; |
451 | 49.5k | add = 1 << (shift - 1); |
452 | | |
453 | 702k | for(j = 0; j < row_limit_2nd_stage; j++) |
454 | 652k | { |
455 | | /* Checking for Zero Cols */ |
456 | 652k | if((zero_cols & 1) == 1) |
457 | 445k | { |
458 | 445k | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
459 | 445k | } |
460 | 207k | else |
461 | 207k | { |
462 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
463 | 3.52M | for(k = 0; k < 16; k++) |
464 | 3.31M | { |
465 | 3.31M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
466 | 3.31M | + g_ai2_ihevc_trans_32[3][k] |
467 | 3.31M | * pi2_src[3 * src_strd] |
468 | 3.31M | + g_ai2_ihevc_trans_32[5][k] |
469 | 3.31M | * pi2_src[5 * src_strd] |
470 | 3.31M | + g_ai2_ihevc_trans_32[7][k] |
471 | 3.31M | * pi2_src[7 * src_strd]; |
472 | 3.31M | } |
473 | 1.86M | for(k = 0; k < 8; k++) |
474 | 1.65M | { |
475 | 1.65M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
476 | 1.65M | + g_ai2_ihevc_trans_32[6][k] |
477 | 1.65M | * pi2_src[6 * src_strd]; |
478 | 1.65M | } |
479 | 1.03M | for(k = 0; k < 4; k++) |
480 | 828k | { |
481 | 828k | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]; |
482 | 828k | } |
483 | 207k | eeeo[0] = 0; |
484 | 207k | eeeo[1] = 0; |
485 | 207k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
486 | 207k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
487 | | |
488 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
489 | 207k | eee[0] = eeee[0] + eeeo[0]; |
490 | 207k | eee[3] = eeee[0] - eeeo[0]; |
491 | 207k | eee[1] = eeee[1] + eeeo[1]; |
492 | 207k | eee[2] = eeee[1] - eeeo[1]; |
493 | 1.03M | for(k = 0; k < 4; k++) |
494 | 828k | { |
495 | 828k | ee[k] = eee[k] + eeo[k]; |
496 | 828k | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
497 | 828k | } |
498 | 1.86M | for(k = 0; k < 8; k++) |
499 | 1.65M | { |
500 | 1.65M | e[k] = ee[k] + eo[k]; |
501 | 1.65M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
502 | 1.65M | } |
503 | 3.51M | for(k = 0; k < 16; k++) |
504 | 3.31M | { |
505 | 3.31M | pi2_tmp[k] = |
506 | 3.31M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
507 | 3.31M | pi2_tmp[k + 16] = |
508 | 3.31M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
509 | 3.31M | } |
510 | 207k | } |
511 | 652k | pi2_src++; |
512 | 652k | pi2_tmp += trans_size; |
513 | 652k | zero_cols = zero_cols >> 1; |
514 | 652k | } |
515 | | |
516 | 49.5k | pi2_tmp = pi2_tmp_orig; |
517 | | |
518 | | /* Inverse Transform 2nd stage */ |
519 | 49.5k | shift = IT_SHIFT_STAGE_2; |
520 | 49.5k | add = 1 << (shift - 1); |
521 | 49.5k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
522 | 19.0k | { |
523 | 624k | for(j = 0; j < trans_size; j++) |
524 | 605k | { |
525 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
526 | 10.1M | for(k = 0; k < 16; k++) |
527 | 9.51M | { |
528 | 9.51M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
529 | 9.51M | + g_ai2_ihevc_trans_32[3][k] |
530 | 9.51M | * pi2_tmp[3 * trans_size]; |
531 | 9.51M | } |
532 | 5.39M | for(k = 0; k < 8; k++) |
533 | 4.79M | { |
534 | 4.79M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
535 | 4.79M | } |
536 | | // for(k = 0; k < 4; k++) |
537 | 605k | { |
538 | 605k | eeo[0] = 0; |
539 | 605k | eeo[1] = 0; |
540 | 605k | eeo[2] = 0; |
541 | 605k | eeo[3] = 0; |
542 | 605k | } |
543 | 605k | eeeo[0] = 0; |
544 | 605k | eeeo[1] = 0; |
545 | 605k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
546 | 605k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
547 | | |
548 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
549 | 605k | eee[0] = eeee[0] + eeeo[0]; |
550 | 605k | eee[3] = eeee[0] - eeeo[0]; |
551 | 605k | eee[1] = eeee[1] + eeeo[1]; |
552 | 605k | eee[2] = eeee[1] - eeeo[1]; |
553 | 2.99M | for(k = 0; k < 4; k++) |
554 | 2.38M | { |
555 | 2.38M | ee[k] = eee[k] + eeo[k]; |
556 | 2.38M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
557 | 2.38M | } |
558 | 5.36M | for(k = 0; k < 8; k++) |
559 | 4.75M | { |
560 | 4.75M | e[k] = ee[k] + eo[k]; |
561 | 4.75M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
562 | 4.75M | } |
563 | 9.92M | for(k = 0; k < 16; k++) |
564 | 9.32M | { |
565 | 9.32M | WORD32 itrans_out; |
566 | 9.32M | itrans_out = |
567 | 9.32M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
568 | 9.32M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
569 | 9.32M | itrans_out = |
570 | 9.32M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
571 | 9.32M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
572 | 9.32M | } |
573 | 605k | pi2_tmp++; |
574 | 605k | pu1_pred += pred_strd; |
575 | 605k | pu1_dst += dst_strd; |
576 | 605k | } |
577 | 19.0k | } |
578 | 30.4k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
579 | 16.5k | { |
580 | 543k | for(j = 0; j < trans_size; j++) |
581 | 526k | { |
582 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
583 | 8.83M | for(k = 0; k < 16; k++) |
584 | 8.31M | { |
585 | 8.31M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
586 | 8.31M | + g_ai2_ihevc_trans_32[3][k] |
587 | 8.31M | * pi2_tmp[3 * trans_size] |
588 | 8.31M | + g_ai2_ihevc_trans_32[5][k] |
589 | 8.31M | * pi2_tmp[5 * trans_size] |
590 | 8.31M | + g_ai2_ihevc_trans_32[7][k] |
591 | 8.31M | * pi2_tmp[7 * trans_size]; |
592 | 8.31M | } |
593 | 4.72M | for(k = 0; k < 8; k++) |
594 | 4.19M | { |
595 | 4.19M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
596 | 4.19M | + g_ai2_ihevc_trans_32[6][k] |
597 | 4.19M | * pi2_tmp[6 * trans_size]; |
598 | 4.19M | } |
599 | 2.62M | for(k = 0; k < 4; k++) |
600 | 2.09M | { |
601 | 2.09M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
602 | 2.09M | } |
603 | 526k | eeeo[0] = 0; |
604 | 526k | eeeo[1] = 0; |
605 | 526k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
606 | 526k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
607 | | |
608 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
609 | 526k | eee[0] = eeee[0] + eeeo[0]; |
610 | 526k | eee[3] = eeee[0] - eeeo[0]; |
611 | 526k | eee[1] = eeee[1] + eeeo[1]; |
612 | 526k | eee[2] = eeee[1] - eeeo[1]; |
613 | 2.60M | for(k = 0; k < 4; k++) |
614 | 2.07M | { |
615 | 2.07M | ee[k] = eee[k] + eeo[k]; |
616 | 2.07M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
617 | 2.07M | } |
618 | 4.63M | for(k = 0; k < 8; k++) |
619 | 4.10M | { |
620 | 4.10M | e[k] = ee[k] + eo[k]; |
621 | 4.10M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
622 | 4.10M | } |
623 | 8.57M | for(k = 0; k < 16; k++) |
624 | 8.04M | { |
625 | 8.04M | WORD32 itrans_out; |
626 | 8.04M | itrans_out = |
627 | 8.04M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
628 | 8.04M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
629 | 8.04M | itrans_out = |
630 | 8.04M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
631 | 8.04M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
632 | 8.04M | } |
633 | 526k | pi2_tmp++; |
634 | 526k | pu1_pred += pred_strd; |
635 | 526k | pu1_dst += dst_strd; |
636 | 526k | } |
637 | 16.5k | } |
638 | 13.8k | else /* All rows of output of 1st stage are non-zero */ |
639 | 13.8k | { |
640 | 455k | for(j = 0; j < trans_size; j++) |
641 | 441k | { |
642 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
643 | 7.30M | for(k = 0; k < 16; k++) |
644 | 6.86M | { |
645 | 6.86M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
646 | 6.86M | + g_ai2_ihevc_trans_32[3][k] |
647 | 6.86M | * pi2_tmp[3 * trans_size] |
648 | 6.86M | + g_ai2_ihevc_trans_32[5][k] |
649 | 6.86M | * pi2_tmp[5 * trans_size] |
650 | 6.86M | + g_ai2_ihevc_trans_32[7][k] |
651 | 6.86M | * pi2_tmp[7 * trans_size] |
652 | 6.86M | + g_ai2_ihevc_trans_32[9][k] |
653 | 6.86M | * pi2_tmp[9 * trans_size] |
654 | 6.86M | + g_ai2_ihevc_trans_32[11][k] |
655 | 6.86M | * pi2_tmp[11 * trans_size] |
656 | 6.86M | + g_ai2_ihevc_trans_32[13][k] |
657 | 6.86M | * pi2_tmp[13 * trans_size] |
658 | 6.86M | + g_ai2_ihevc_trans_32[15][k] |
659 | 6.86M | * pi2_tmp[15 * trans_size] |
660 | 6.86M | + g_ai2_ihevc_trans_32[17][k] |
661 | 6.86M | * pi2_tmp[17 * trans_size] |
662 | 6.86M | + g_ai2_ihevc_trans_32[19][k] |
663 | 6.86M | * pi2_tmp[19 * trans_size] |
664 | 6.86M | + g_ai2_ihevc_trans_32[21][k] |
665 | 6.86M | * pi2_tmp[21 * trans_size] |
666 | 6.86M | + g_ai2_ihevc_trans_32[23][k] |
667 | 6.86M | * pi2_tmp[23 * trans_size] |
668 | 6.86M | + g_ai2_ihevc_trans_32[25][k] |
669 | 6.86M | * pi2_tmp[25 * trans_size] |
670 | 6.86M | + g_ai2_ihevc_trans_32[27][k] |
671 | 6.86M | * pi2_tmp[27 * trans_size] |
672 | 6.86M | + g_ai2_ihevc_trans_32[29][k] |
673 | 6.86M | * pi2_tmp[29 * trans_size] |
674 | 6.86M | + g_ai2_ihevc_trans_32[31][k] |
675 | 6.86M | * pi2_tmp[31 * trans_size]; |
676 | 6.86M | } |
677 | 3.96M | for(k = 0; k < 8; k++) |
678 | 3.52M | { |
679 | 3.52M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
680 | 3.52M | + g_ai2_ihevc_trans_32[6][k] |
681 | 3.52M | * pi2_tmp[6 * trans_size] |
682 | 3.52M | + g_ai2_ihevc_trans_32[10][k] |
683 | 3.52M | * pi2_tmp[10 * trans_size] |
684 | 3.52M | + g_ai2_ihevc_trans_32[14][k] |
685 | 3.52M | * pi2_tmp[14 * trans_size] |
686 | 3.52M | + g_ai2_ihevc_trans_32[18][k] |
687 | 3.52M | * pi2_tmp[18 * trans_size] |
688 | 3.52M | + g_ai2_ihevc_trans_32[22][k] |
689 | 3.52M | * pi2_tmp[22 * trans_size] |
690 | 3.52M | + g_ai2_ihevc_trans_32[26][k] |
691 | 3.52M | * pi2_tmp[26 * trans_size] |
692 | 3.52M | + g_ai2_ihevc_trans_32[30][k] |
693 | 3.52M | * pi2_tmp[30 * trans_size]; |
694 | 3.52M | } |
695 | 2.21M | for(k = 0; k < 4; k++) |
696 | 1.77M | { |
697 | 1.77M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
698 | 1.77M | + g_ai2_ihevc_trans_32[12][k] |
699 | 1.77M | * pi2_tmp[12 * trans_size] |
700 | 1.77M | + g_ai2_ihevc_trans_32[20][k] |
701 | 1.77M | * pi2_tmp[20 * trans_size] |
702 | 1.77M | + g_ai2_ihevc_trans_32[28][k] |
703 | 1.77M | * pi2_tmp[28 * trans_size]; |
704 | 1.77M | } |
705 | 441k | eeeo[0] = |
706 | 441k | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
707 | 441k | + g_ai2_ihevc_trans_32[24][0] |
708 | 441k | * pi2_tmp[24 |
709 | 441k | * trans_size]; |
710 | 441k | eeeo[1] = |
711 | 441k | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
712 | 441k | + g_ai2_ihevc_trans_32[24][1] |
713 | 441k | * pi2_tmp[24 |
714 | 441k | * trans_size]; |
715 | 441k | eeee[0] = |
716 | 441k | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
717 | 441k | + g_ai2_ihevc_trans_32[16][0] |
718 | 441k | * pi2_tmp[16 |
719 | 441k | * trans_size]; |
720 | 441k | eeee[1] = |
721 | 441k | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
722 | 441k | + g_ai2_ihevc_trans_32[16][1] |
723 | 441k | * pi2_tmp[16 |
724 | 441k | * trans_size]; |
725 | | |
726 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
727 | 441k | eee[0] = eeee[0] + eeeo[0]; |
728 | 441k | eee[3] = eeee[0] - eeeo[0]; |
729 | 441k | eee[1] = eeee[1] + eeeo[1]; |
730 | 441k | eee[2] = eeee[1] - eeeo[1]; |
731 | 2.20M | for(k = 0; k < 4; k++) |
732 | 1.76M | { |
733 | 1.76M | ee[k] = eee[k] + eeo[k]; |
734 | 1.76M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
735 | 1.76M | } |
736 | 3.96M | for(k = 0; k < 8; k++) |
737 | 3.52M | { |
738 | 3.52M | e[k] = ee[k] + eo[k]; |
739 | 3.52M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
740 | 3.52M | } |
741 | 7.30M | for(k = 0; k < 16; k++) |
742 | 6.86M | { |
743 | 6.86M | WORD32 itrans_out; |
744 | 6.86M | itrans_out = |
745 | 6.86M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
746 | 6.86M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
747 | 6.86M | itrans_out = |
748 | 6.86M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
749 | 6.86M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
750 | 6.86M | } |
751 | 441k | pi2_tmp++; |
752 | 441k | pu1_pred += pred_strd; |
753 | 441k | pu1_dst += dst_strd; |
754 | 441k | } |
755 | 13.8k | } |
756 | | /************************************************************************************************/ |
757 | | /************************************END - IT_RECON_32x32****************************************/ |
758 | | /************************************************************************************************/ |
759 | 49.5k | } |
760 | 222k | else /* All rows of input are non-zero */ |
761 | 222k | { |
762 | | /************************************************************************************************/ |
763 | | /**********************************START - IT_RECON_32x32****************************************/ |
764 | | /************************************************************************************************/ |
765 | | /* Inverse Transform 1st stage */ |
766 | 222k | shift = IT_SHIFT_STAGE_1; |
767 | 222k | add = 1 << (shift - 1); |
768 | | |
769 | 6.05M | for(j = 0; j < row_limit_2nd_stage; j++) |
770 | 5.82M | { |
771 | | /* Checking for Zero Cols */ |
772 | 5.82M | if((zero_cols & 1) == 1) |
773 | 796k | { |
774 | 796k | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
775 | 796k | } |
776 | 5.03M | else |
777 | 5.03M | { |
778 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
779 | 85.5M | for(k = 0; k < 16; k++) |
780 | 80.5M | { |
781 | 80.5M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
782 | 80.5M | + g_ai2_ihevc_trans_32[3][k] |
783 | 80.5M | * pi2_src[3 * src_strd] |
784 | 80.5M | + g_ai2_ihevc_trans_32[5][k] |
785 | 80.5M | * pi2_src[5 * src_strd] |
786 | 80.5M | + g_ai2_ihevc_trans_32[7][k] |
787 | 80.5M | * pi2_src[7 * src_strd] |
788 | 80.5M | + g_ai2_ihevc_trans_32[9][k] |
789 | 80.5M | * pi2_src[9 * src_strd] |
790 | 80.5M | + g_ai2_ihevc_trans_32[11][k] |
791 | 80.5M | * pi2_src[11 * src_strd] |
792 | 80.5M | + g_ai2_ihevc_trans_32[13][k] |
793 | 80.5M | * pi2_src[13 * src_strd] |
794 | 80.5M | + g_ai2_ihevc_trans_32[15][k] |
795 | 80.5M | * pi2_src[15 * src_strd] |
796 | 80.5M | + g_ai2_ihevc_trans_32[17][k] |
797 | 80.5M | * pi2_src[17 * src_strd] |
798 | 80.5M | + g_ai2_ihevc_trans_32[19][k] |
799 | 80.5M | * pi2_src[19 * src_strd] |
800 | 80.5M | + g_ai2_ihevc_trans_32[21][k] |
801 | 80.5M | * pi2_src[21 * src_strd] |
802 | 80.5M | + g_ai2_ihevc_trans_32[23][k] |
803 | 80.5M | * pi2_src[23 * src_strd] |
804 | 80.5M | + g_ai2_ihevc_trans_32[25][k] |
805 | 80.5M | * pi2_src[25 * src_strd] |
806 | 80.5M | + g_ai2_ihevc_trans_32[27][k] |
807 | 80.5M | * pi2_src[27 * src_strd] |
808 | 80.5M | + g_ai2_ihevc_trans_32[29][k] |
809 | 80.5M | * pi2_src[29 * src_strd] |
810 | 80.5M | + g_ai2_ihevc_trans_32[31][k] |
811 | 80.5M | * pi2_src[31 * src_strd]; |
812 | 80.5M | } |
813 | 45.2M | for(k = 0; k < 8; k++) |
814 | 40.2M | { |
815 | 40.2M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
816 | 40.2M | + g_ai2_ihevc_trans_32[6][k] |
817 | 40.2M | * pi2_src[6 * src_strd] |
818 | 40.2M | + g_ai2_ihevc_trans_32[10][k] |
819 | 40.2M | * pi2_src[10 * src_strd] |
820 | 40.2M | + g_ai2_ihevc_trans_32[14][k] |
821 | 40.2M | * pi2_src[14 * src_strd] |
822 | 40.2M | + g_ai2_ihevc_trans_32[18][k] |
823 | 40.2M | * pi2_src[18 * src_strd] |
824 | 40.2M | + g_ai2_ihevc_trans_32[22][k] |
825 | 40.2M | * pi2_src[22 * src_strd] |
826 | 40.2M | + g_ai2_ihevc_trans_32[26][k] |
827 | 40.2M | * pi2_src[26 * src_strd] |
828 | 40.2M | + g_ai2_ihevc_trans_32[30][k] |
829 | 40.2M | * pi2_src[30 * src_strd]; |
830 | 40.2M | } |
831 | 25.1M | for(k = 0; k < 4; k++) |
832 | 20.1M | { |
833 | 20.1M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd] |
834 | 20.1M | + g_ai2_ihevc_trans_32[12][k] |
835 | 20.1M | * pi2_src[12 * src_strd] |
836 | 20.1M | + g_ai2_ihevc_trans_32[20][k] |
837 | 20.1M | * pi2_src[20 * src_strd] |
838 | 20.1M | + g_ai2_ihevc_trans_32[28][k] |
839 | 20.1M | * pi2_src[28 * src_strd]; |
840 | 20.1M | } |
841 | 5.03M | eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd] |
842 | 5.03M | + g_ai2_ihevc_trans_32[24][0] |
843 | 5.03M | * pi2_src[24 * src_strd]; |
844 | 5.03M | eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd] |
845 | 5.03M | + g_ai2_ihevc_trans_32[24][1] |
846 | 5.03M | * pi2_src[24 * src_strd]; |
847 | 5.03M | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0] |
848 | 5.03M | + g_ai2_ihevc_trans_32[16][0] |
849 | 5.03M | * pi2_src[16 * src_strd]; |
850 | 5.03M | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0] |
851 | 5.03M | + g_ai2_ihevc_trans_32[16][1] |
852 | 5.03M | * pi2_src[16 * src_strd]; |
853 | | |
854 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
855 | 5.03M | eee[0] = eeee[0] + eeeo[0]; |
856 | 5.03M | eee[3] = eeee[0] - eeeo[0]; |
857 | 5.03M | eee[1] = eeee[1] + eeeo[1]; |
858 | 5.03M | eee[2] = eeee[1] - eeeo[1]; |
859 | 25.1M | for(k = 0; k < 4; k++) |
860 | 20.1M | { |
861 | 20.1M | ee[k] = eee[k] + eeo[k]; |
862 | 20.1M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
863 | 20.1M | } |
864 | 45.2M | for(k = 0; k < 8; k++) |
865 | 40.2M | { |
866 | 40.2M | e[k] = ee[k] + eo[k]; |
867 | 40.2M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
868 | 40.2M | } |
869 | 85.5M | for(k = 0; k < 16; k++) |
870 | 80.5M | { |
871 | 80.5M | pi2_tmp[k] = |
872 | 80.5M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
873 | 80.5M | pi2_tmp[k + 16] = |
874 | 80.5M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
875 | 80.5M | } |
876 | 5.03M | } |
877 | 5.82M | pi2_src++; |
878 | 5.82M | pi2_tmp += trans_size; |
879 | 5.82M | zero_cols = zero_cols >> 1; |
880 | 5.82M | } |
881 | | |
882 | 222k | pi2_tmp = pi2_tmp_orig; |
883 | | |
884 | | /* Inverse Transform 2nd stage */ |
885 | 222k | shift = IT_SHIFT_STAGE_2; |
886 | 222k | add = 1 << (shift - 1); |
887 | 222k | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
888 | 30.5k | { |
889 | 1.00M | for(j = 0; j < trans_size; j++) |
890 | 972k | { |
891 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
892 | 16.3M | for(k = 0; k < 16; k++) |
893 | 15.4M | { |
894 | 15.4M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
895 | 15.4M | + g_ai2_ihevc_trans_32[3][k] |
896 | 15.4M | * pi2_tmp[3 * trans_size]; |
897 | 15.4M | } |
898 | 8.71M | for(k = 0; k < 8; k++) |
899 | 7.74M | { |
900 | 7.74M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
901 | 7.74M | } |
902 | | // for(k = 0; k < 4; k++) |
903 | 972k | { |
904 | 972k | eeo[0] = 0; |
905 | 972k | eeo[1] = 0; |
906 | 972k | eeo[2] = 0; |
907 | 972k | eeo[3] = 0; |
908 | 972k | } |
909 | 972k | eeeo[0] = 0; |
910 | 972k | eeeo[1] = 0; |
911 | 972k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
912 | 972k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
913 | | |
914 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
915 | 972k | eee[0] = eeee[0] + eeeo[0]; |
916 | 972k | eee[3] = eeee[0] - eeeo[0]; |
917 | 972k | eee[1] = eeee[1] + eeeo[1]; |
918 | 972k | eee[2] = eeee[1] - eeeo[1]; |
919 | 4.83M | for(k = 0; k < 4; k++) |
920 | 3.86M | { |
921 | 3.86M | ee[k] = eee[k] + eeo[k]; |
922 | 3.86M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
923 | 3.86M | } |
924 | 8.69M | for(k = 0; k < 8; k++) |
925 | 7.72M | { |
926 | 7.72M | e[k] = ee[k] + eo[k]; |
927 | 7.72M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
928 | 7.72M | } |
929 | 16.2M | for(k = 0; k < 16; k++) |
930 | 15.3M | { |
931 | 15.3M | WORD32 itrans_out; |
932 | 15.3M | itrans_out = |
933 | 15.3M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
934 | 15.3M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
935 | 15.3M | itrans_out = |
936 | 15.3M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
937 | 15.3M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
938 | 15.3M | } |
939 | 972k | pi2_tmp++; |
940 | 972k | pu1_pred += pred_strd; |
941 | 972k | pu1_dst += dst_strd; |
942 | 972k | } |
943 | 30.5k | } |
944 | 192k | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
945 | 18.2k | { |
946 | 598k | for(j = 0; j < trans_size; j++) |
947 | 579k | { |
948 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
949 | 9.66M | for(k = 0; k < 16; k++) |
950 | 9.08M | { |
951 | 9.08M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
952 | 9.08M | + g_ai2_ihevc_trans_32[3][k] |
953 | 9.08M | * pi2_tmp[3 * trans_size] |
954 | 9.08M | + g_ai2_ihevc_trans_32[5][k] |
955 | 9.08M | * pi2_tmp[5 * trans_size] |
956 | 9.08M | + g_ai2_ihevc_trans_32[7][k] |
957 | 9.08M | * pi2_tmp[7 * trans_size]; |
958 | 9.08M | } |
959 | 5.18M | for(k = 0; k < 8; k++) |
960 | 4.60M | { |
961 | 4.60M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
962 | 4.60M | + g_ai2_ihevc_trans_32[6][k] |
963 | 4.60M | * pi2_tmp[6 * trans_size]; |
964 | 4.60M | } |
965 | 2.89M | for(k = 0; k < 4; k++) |
966 | 2.31M | { |
967 | 2.31M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
968 | 2.31M | } |
969 | 579k | eeeo[0] = 0; |
970 | 579k | eeeo[1] = 0; |
971 | 579k | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
972 | 579k | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
973 | | |
974 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
975 | 579k | eee[0] = eeee[0] + eeeo[0]; |
976 | 579k | eee[3] = eeee[0] - eeeo[0]; |
977 | 579k | eee[1] = eeee[1] + eeeo[1]; |
978 | 579k | eee[2] = eeee[1] - eeeo[1]; |
979 | 2.88M | for(k = 0; k < 4; k++) |
980 | 2.30M | { |
981 | 2.30M | ee[k] = eee[k] + eeo[k]; |
982 | 2.30M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
983 | 2.30M | } |
984 | 5.17M | for(k = 0; k < 8; k++) |
985 | 4.59M | { |
986 | 4.59M | e[k] = ee[k] + eo[k]; |
987 | 4.59M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
988 | 4.59M | } |
989 | 9.61M | for(k = 0; k < 16; k++) |
990 | 9.03M | { |
991 | 9.03M | WORD32 itrans_out; |
992 | 9.03M | itrans_out = |
993 | 9.03M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
994 | 9.03M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
995 | 9.03M | itrans_out = |
996 | 9.03M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
997 | 9.03M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
998 | 9.03M | } |
999 | 579k | pi2_tmp++; |
1000 | 579k | pu1_pred += pred_strd; |
1001 | 579k | pu1_dst += dst_strd; |
1002 | 579k | } |
1003 | 18.2k | } |
1004 | 173k | else /* All rows of output of 1st stage are non-zero */ |
1005 | 173k | { |
1006 | 5.73M | for(j = 0; j < trans_size; j++) |
1007 | 5.55M | { |
1008 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
1009 | 93.9M | for(k = 0; k < 16; k++) |
1010 | 88.3M | { |
1011 | 88.3M | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
1012 | 88.3M | + g_ai2_ihevc_trans_32[3][k] |
1013 | 88.3M | * pi2_tmp[3 * trans_size] |
1014 | 88.3M | + g_ai2_ihevc_trans_32[5][k] |
1015 | 88.3M | * pi2_tmp[5 * trans_size] |
1016 | 88.3M | + g_ai2_ihevc_trans_32[7][k] |
1017 | 88.3M | * pi2_tmp[7 * trans_size] |
1018 | 88.3M | + g_ai2_ihevc_trans_32[9][k] |
1019 | 88.3M | * pi2_tmp[9 * trans_size] |
1020 | 88.3M | + g_ai2_ihevc_trans_32[11][k] |
1021 | 88.3M | * pi2_tmp[11 * trans_size] |
1022 | 88.3M | + g_ai2_ihevc_trans_32[13][k] |
1023 | 88.3M | * pi2_tmp[13 * trans_size] |
1024 | 88.3M | + g_ai2_ihevc_trans_32[15][k] |
1025 | 88.3M | * pi2_tmp[15 * trans_size] |
1026 | 88.3M | + g_ai2_ihevc_trans_32[17][k] |
1027 | 88.3M | * pi2_tmp[17 * trans_size] |
1028 | 88.3M | + g_ai2_ihevc_trans_32[19][k] |
1029 | 88.3M | * pi2_tmp[19 * trans_size] |
1030 | 88.3M | + g_ai2_ihevc_trans_32[21][k] |
1031 | 88.3M | * pi2_tmp[21 * trans_size] |
1032 | 88.3M | + g_ai2_ihevc_trans_32[23][k] |
1033 | 88.3M | * pi2_tmp[23 * trans_size] |
1034 | 88.3M | + g_ai2_ihevc_trans_32[25][k] |
1035 | 88.3M | * pi2_tmp[25 * trans_size] |
1036 | 88.3M | + g_ai2_ihevc_trans_32[27][k] |
1037 | 88.3M | * pi2_tmp[27 * trans_size] |
1038 | 88.3M | + g_ai2_ihevc_trans_32[29][k] |
1039 | 88.3M | * pi2_tmp[29 * trans_size] |
1040 | 88.3M | + g_ai2_ihevc_trans_32[31][k] |
1041 | 88.3M | * pi2_tmp[31 * trans_size]; |
1042 | 88.3M | } |
1043 | 49.9M | for(k = 0; k < 8; k++) |
1044 | 44.3M | { |
1045 | 44.3M | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
1046 | 44.3M | + g_ai2_ihevc_trans_32[6][k] |
1047 | 44.3M | * pi2_tmp[6 * trans_size] |
1048 | 44.3M | + g_ai2_ihevc_trans_32[10][k] |
1049 | 44.3M | * pi2_tmp[10 * trans_size] |
1050 | 44.3M | + g_ai2_ihevc_trans_32[14][k] |
1051 | 44.3M | * pi2_tmp[14 * trans_size] |
1052 | 44.3M | + g_ai2_ihevc_trans_32[18][k] |
1053 | 44.3M | * pi2_tmp[18 * trans_size] |
1054 | 44.3M | + g_ai2_ihevc_trans_32[22][k] |
1055 | 44.3M | * pi2_tmp[22 * trans_size] |
1056 | 44.3M | + g_ai2_ihevc_trans_32[26][k] |
1057 | 44.3M | * pi2_tmp[26 * trans_size] |
1058 | 44.3M | + g_ai2_ihevc_trans_32[30][k] |
1059 | 44.3M | * pi2_tmp[30 * trans_size]; |
1060 | 44.3M | } |
1061 | 27.7M | for(k = 0; k < 4; k++) |
1062 | 22.2M | { |
1063 | 22.2M | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
1064 | 22.2M | + g_ai2_ihevc_trans_32[12][k] |
1065 | 22.2M | * pi2_tmp[12 * trans_size] |
1066 | 22.2M | + g_ai2_ihevc_trans_32[20][k] |
1067 | 22.2M | * pi2_tmp[20 * trans_size] |
1068 | 22.2M | + g_ai2_ihevc_trans_32[28][k] |
1069 | 22.2M | * pi2_tmp[28 * trans_size]; |
1070 | 22.2M | } |
1071 | 5.55M | eeeo[0] = |
1072 | 5.55M | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
1073 | 5.55M | + g_ai2_ihevc_trans_32[24][0] |
1074 | 5.55M | * pi2_tmp[24 |
1075 | 5.55M | * trans_size]; |
1076 | 5.55M | eeeo[1] = |
1077 | 5.55M | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
1078 | 5.55M | + g_ai2_ihevc_trans_32[24][1] |
1079 | 5.55M | * pi2_tmp[24 |
1080 | 5.55M | * trans_size]; |
1081 | 5.55M | eeee[0] = |
1082 | 5.55M | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
1083 | 5.55M | + g_ai2_ihevc_trans_32[16][0] |
1084 | 5.55M | * pi2_tmp[16 |
1085 | 5.55M | * trans_size]; |
1086 | 5.55M | eeee[1] = |
1087 | 5.55M | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
1088 | 5.55M | + g_ai2_ihevc_trans_32[16][1] |
1089 | 5.55M | * pi2_tmp[16 |
1090 | 5.55M | * trans_size]; |
1091 | | |
1092 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
1093 | 5.55M | eee[0] = eeee[0] + eeeo[0]; |
1094 | 5.55M | eee[3] = eeee[0] - eeeo[0]; |
1095 | 5.55M | eee[1] = eeee[1] + eeeo[1]; |
1096 | 5.55M | eee[2] = eeee[1] - eeeo[1]; |
1097 | 27.7M | for(k = 0; k < 4; k++) |
1098 | 22.2M | { |
1099 | 22.2M | ee[k] = eee[k] + eeo[k]; |
1100 | 22.2M | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
1101 | 22.2M | } |
1102 | 49.9M | for(k = 0; k < 8; k++) |
1103 | 44.4M | { |
1104 | 44.4M | e[k] = ee[k] + eo[k]; |
1105 | 44.4M | e[k + 8] = ee[7 - k] - eo[7 - k]; |
1106 | 44.4M | } |
1107 | 94.1M | for(k = 0; k < 16; k++) |
1108 | 88.6M | { |
1109 | 88.6M | WORD32 itrans_out; |
1110 | 88.6M | itrans_out = |
1111 | 88.6M | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
1112 | 88.6M | pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k])); |
1113 | 88.6M | itrans_out = |
1114 | 88.6M | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
1115 | 88.6M | pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16])); |
1116 | 88.6M | } |
1117 | 5.55M | pi2_tmp++; |
1118 | 5.55M | pu1_pred += pred_strd; |
1119 | 5.55M | pu1_dst += dst_strd; |
1120 | 5.55M | } |
1121 | 173k | } |
1122 | | /************************************************************************************************/ |
1123 | | /************************************END - IT_RECON_32x32****************************************/ |
1124 | | /************************************************************************************************/ |
1125 | 222k | } |
1126 | 340k | } |
1127 | | |