/src/libhevc/common/ihevc_chroma_itrans_recon_32x32.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2025 Ittiam Systems Pvt Ltd, Bangalore |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ******************************************************************************/ |
18 | | /** |
19 | | ******************************************************************************* |
20 | | * @file |
21 | | * ihevc_chroma_itrans_recon_32x32.c |
22 | | * |
23 | | * @brief |
24 | | * Contains function definitions for 32x32 inverse transform and reconstruction |
25 | | * of chroma interleaved data. |
26 | | * |
27 | | * @author |
28 | | * 100927 |
29 | | * |
30 | | * @par List of Functions: |
31 | | * - ihevc_chroma_itrans_recon_32x32() |
32 | | * |
33 | | * @remarks |
34 | | * None |
35 | | * |
36 | | ******************************************************************************* |
37 | | */ |
38 | | |
39 | | #include <stdio.h> |
40 | | #include <string.h> |
41 | | #include "ihevc_typedefs.h" |
42 | | #include "ihevc_macros.h" |
43 | | #include "ihevc_platform_macros.h" |
44 | | #include "ihevc_defs.h" |
45 | | #include "ihevc_trans_tables.h" |
46 | | #include "ihevc_chroma_itrans_recon.h" |
47 | | #include "ihevc_func_selector.h" |
48 | | #include "ihevc_trans_macros.h" |
49 | | |
50 | | /* All the functions work on one component (U or V) of interleaved data, depending upon the pointers passed to them */ |
51 | | /* Data visualization */ |
52 | | /* U V U V U V U V */ |
53 | | /* U V U V U V U V */ |
54 | | /* U V U V U V U V */ |
55 | | /* U V U V U V U V */ |
56 | | /* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */ |
57 | | /* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */ |
58 | | |
59 | | |
60 | | /** |
61 | | ******************************************************************************* |
62 | | * |
63 | | * @brief |
64 | | * This function performs Inverse transform and reconstruction for 32x32 |
65 | | * input block |
66 | | * |
67 | | * @par Description: |
68 | | * Performs inverse transform and adds the prediction data and clips output |
69 | | * to 8 bit |
70 | | * |
71 | | * @param[in] pi2_src |
72 | | * Input 32x32 coefficients |
73 | | * |
74 | | * @param[in] pi2_tmp |
75 | | * Temporary 32x32 buffer for storing inverse transform |
76 | | * 1st stage output |
77 | | * |
78 | | * @param[in] pu1_pred |
79 | | * Prediction 32x32 block |
80 | | * |
81 | | * @param[out] pu1_dst |
82 | | * Output 32x32 block |
83 | | * |
84 | | * @param[in] src_strd |
85 | | * Input stride |
86 | | * |
87 | | * @param[in] pred_strd |
88 | | * Prediction stride |
89 | | * |
90 | | * @param[in] dst_strd |
91 | | * Output Stride |
92 | | * |
93 | | * @param[in] zero_cols |
94 | | * Bitmask of zero columns in pi2_src (bit j set: column j is all zero) |
95 | | * |
96 | | * @param[in] zero_rows |
97 | | * Bitmask of zero rows in pi2_src (bit i set: row i is all zero) |
98 | | * |
99 | | * @returns Void |
100 | | * |
101 | | * @remarks |
102 | | * None |
103 | | * |
104 | | ******************************************************************************* |
105 | | */ |
106 | | |
107 | | |
108 | | void ihevc_chroma_itrans_recon_32x32(WORD16 *pi2_src, |
109 | | WORD16 *pi2_tmp, |
110 | | UWORD8 *pu1_pred, |
111 | | UWORD8 *pu1_dst, |
112 | | WORD32 src_strd, |
113 | | WORD32 pred_strd, |
114 | | WORD32 dst_strd, |
115 | | WORD32 zero_cols, |
116 | | WORD32 zero_rows) |
117 | 0 | { |
118 | 0 | WORD32 j, k; |
119 | 0 | WORD32 e[16], o[16]; |
120 | 0 | WORD32 ee[8], eo[8]; |
121 | 0 | WORD32 eee[4], eeo[4]; |
122 | 0 | WORD32 eeee[2], eeeo[2]; |
123 | 0 | WORD32 add; |
124 | 0 | WORD32 shift; |
125 | 0 | WORD16 *pi2_tmp_orig; |
126 | 0 | WORD32 trans_size; |
127 | 0 | WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols; |
128 | |
|
129 | 0 | trans_size = TRANS_SIZE_32; |
130 | 0 | pi2_tmp_orig = pi2_tmp; |
131 | |
|
132 | 0 | if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) |
133 | 0 | row_limit_2nd_stage = 4; |
134 | 0 | else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) |
135 | 0 | row_limit_2nd_stage = 8; |
136 | 0 | else |
137 | 0 | row_limit_2nd_stage = TRANS_SIZE_32; |
138 | |
|
139 | 0 | if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */ |
140 | 0 | { |
141 | | /************************************************************************************************/ |
142 | | /**********************************START - IT_RECON_32x32****************************************/ |
143 | | /************************************************************************************************/ |
144 | | /* Inverse Transform 1st stage */ |
145 | 0 | shift = IT_SHIFT_STAGE_1; |
146 | 0 | add = 1 << (shift - 1); |
147 | |
|
148 | 0 | for(j = 0; j < row_limit_2nd_stage; j++) |
149 | 0 | { |
150 | | /* Checking for Zero Cols */ |
151 | 0 | if((zero_cols & 1) == 1) |
152 | 0 | { |
153 | 0 | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
154 | 0 | } |
155 | 0 | else |
156 | 0 | { |
157 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
158 | 0 | for(k = 0; k < 16; k++) |
159 | 0 | { |
160 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
161 | 0 | + g_ai2_ihevc_trans_32[3][k] |
162 | 0 | * pi2_src[3 * src_strd]; |
163 | 0 | } |
164 | 0 | for(k = 0; k < 8; k++) |
165 | 0 | { |
166 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]; |
167 | 0 | } |
168 | | // for(k = 0; k < 4; k++) |
169 | 0 | { |
170 | 0 | eeo[0] = 0; |
171 | 0 | eeo[1] = 0; |
172 | 0 | eeo[2] = 0; |
173 | 0 | eeo[3] = 0; |
174 | 0 | } |
175 | 0 | eeeo[0] = 0; |
176 | 0 | eeeo[1] = 0; |
177 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
178 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
179 | | |
180 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
181 | 0 | eee[0] = eeee[0] + eeeo[0]; |
182 | 0 | eee[3] = eeee[0] - eeeo[0]; |
183 | 0 | eee[1] = eeee[1] + eeeo[1]; |
184 | 0 | eee[2] = eeee[1] - eeeo[1]; |
185 | 0 | for(k = 0; k < 4; k++) |
186 | 0 | { |
187 | 0 | ee[k] = eee[k] + eeo[k]; |
188 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
189 | 0 | } |
190 | 0 | for(k = 0; k < 8; k++) |
191 | 0 | { |
192 | 0 | e[k] = ee[k] + eo[k]; |
193 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
194 | 0 | } |
195 | 0 | for(k = 0; k < 16; k++) |
196 | 0 | { |
197 | 0 | pi2_tmp[k] = |
198 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
199 | 0 | pi2_tmp[k + 16] = |
200 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
201 | 0 | } |
202 | 0 | } |
203 | 0 | pi2_src++; |
204 | 0 | pi2_tmp += trans_size; |
205 | 0 | zero_cols = zero_cols >> 1; |
206 | 0 | } |
207 | |
|
208 | 0 | pi2_tmp = pi2_tmp_orig; |
209 | | |
210 | | /* Inverse Transform 2nd stage */ |
211 | 0 | shift = IT_SHIFT_STAGE_2; |
212 | 0 | add = 1 << (shift - 1); |
213 | 0 | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
214 | 0 | { |
215 | 0 | for(j = 0; j < trans_size; j++) |
216 | 0 | { |
217 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
218 | 0 | for(k = 0; k < 16; k++) |
219 | 0 | { |
220 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
221 | 0 | + g_ai2_ihevc_trans_32[3][k] |
222 | 0 | * pi2_tmp[3 * trans_size]; |
223 | 0 | } |
224 | 0 | for(k = 0; k < 8; k++) |
225 | 0 | { |
226 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
227 | 0 | } |
228 | | // for(k = 0; k < 4; k++) |
229 | 0 | { |
230 | 0 | eeo[0] = 0; |
231 | 0 | eeo[1] = 0; |
232 | 0 | eeo[2] = 0; |
233 | 0 | eeo[3] = 0; |
234 | 0 | } |
235 | 0 | eeeo[0] = 0; |
236 | 0 | eeeo[1] = 0; |
237 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
238 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
239 | | |
240 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
241 | 0 | eee[0] = eeee[0] + eeeo[0]; |
242 | 0 | eee[3] = eeee[0] - eeeo[0]; |
243 | 0 | eee[1] = eeee[1] + eeeo[1]; |
244 | 0 | eee[2] = eeee[1] - eeeo[1]; |
245 | 0 | for(k = 0; k < 4; k++) |
246 | 0 | { |
247 | 0 | ee[k] = eee[k] + eeo[k]; |
248 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
249 | 0 | } |
250 | 0 | for(k = 0; k < 8; k++) |
251 | 0 | { |
252 | 0 | e[k] = ee[k] + eo[k]; |
253 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
254 | 0 | } |
255 | 0 | for(k = 0; k < 16; k++) |
256 | 0 | { |
257 | 0 | WORD32 itrans_out; |
258 | 0 | itrans_out = |
259 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
260 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
261 | 0 | itrans_out = |
262 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
263 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
264 | 0 | } |
265 | 0 | pi2_tmp++; |
266 | 0 | pu1_pred += pred_strd; |
267 | 0 | pu1_dst += dst_strd; |
268 | 0 | } |
269 | 0 | } |
270 | 0 | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
271 | 0 | { |
272 | 0 | for(j = 0; j < trans_size; j++) |
273 | 0 | { |
274 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
275 | 0 | for(k = 0; k < 16; k++) |
276 | 0 | { |
277 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
278 | 0 | + g_ai2_ihevc_trans_32[3][k] |
279 | 0 | * pi2_tmp[3 * trans_size] |
280 | 0 | + g_ai2_ihevc_trans_32[5][k] |
281 | 0 | * pi2_tmp[5 * trans_size] |
282 | 0 | + g_ai2_ihevc_trans_32[7][k] |
283 | 0 | * pi2_tmp[7 * trans_size]; |
284 | 0 | } |
285 | 0 | for(k = 0; k < 8; k++) |
286 | 0 | { |
287 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
288 | 0 | + g_ai2_ihevc_trans_32[6][k] |
289 | 0 | * pi2_tmp[6 * trans_size]; |
290 | 0 | } |
291 | 0 | for(k = 0; k < 4; k++) |
292 | 0 | { |
293 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
294 | 0 | } |
295 | 0 | eeeo[0] = 0; |
296 | 0 | eeeo[1] = 0; |
297 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
298 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
299 | | |
300 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
301 | 0 | eee[0] = eeee[0] + eeeo[0]; |
302 | 0 | eee[3] = eeee[0] - eeeo[0]; |
303 | 0 | eee[1] = eeee[1] + eeeo[1]; |
304 | 0 | eee[2] = eeee[1] - eeeo[1]; |
305 | 0 | for(k = 0; k < 4; k++) |
306 | 0 | { |
307 | 0 | ee[k] = eee[k] + eeo[k]; |
308 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
309 | 0 | } |
310 | 0 | for(k = 0; k < 8; k++) |
311 | 0 | { |
312 | 0 | e[k] = ee[k] + eo[k]; |
313 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
314 | 0 | } |
315 | 0 | for(k = 0; k < 16; k++) |
316 | 0 | { |
317 | 0 | WORD32 itrans_out; |
318 | 0 | itrans_out = |
319 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
320 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
321 | 0 | itrans_out = |
322 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
323 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
324 | 0 | } |
325 | 0 | pi2_tmp++; |
326 | 0 | pu1_pred += pred_strd; |
327 | 0 | pu1_dst += dst_strd; |
328 | 0 | } |
329 | 0 | } |
330 | 0 | else /* All rows of output of 1st stage are non-zero */ |
331 | 0 | { |
332 | 0 | for(j = 0; j < trans_size; j++) |
333 | 0 | { |
334 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
335 | 0 | for(k = 0; k < 16; k++) |
336 | 0 | { |
337 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
338 | 0 | + g_ai2_ihevc_trans_32[3][k] |
339 | 0 | * pi2_tmp[3 * trans_size] |
340 | 0 | + g_ai2_ihevc_trans_32[5][k] |
341 | 0 | * pi2_tmp[5 * trans_size] |
342 | 0 | + g_ai2_ihevc_trans_32[7][k] |
343 | 0 | * pi2_tmp[7 * trans_size] |
344 | 0 | + g_ai2_ihevc_trans_32[9][k] |
345 | 0 | * pi2_tmp[9 * trans_size] |
346 | 0 | + g_ai2_ihevc_trans_32[11][k] |
347 | 0 | * pi2_tmp[11 * trans_size] |
348 | 0 | + g_ai2_ihevc_trans_32[13][k] |
349 | 0 | * pi2_tmp[13 * trans_size] |
350 | 0 | + g_ai2_ihevc_trans_32[15][k] |
351 | 0 | * pi2_tmp[15 * trans_size] |
352 | 0 | + g_ai2_ihevc_trans_32[17][k] |
353 | 0 | * pi2_tmp[17 * trans_size] |
354 | 0 | + g_ai2_ihevc_trans_32[19][k] |
355 | 0 | * pi2_tmp[19 * trans_size] |
356 | 0 | + g_ai2_ihevc_trans_32[21][k] |
357 | 0 | * pi2_tmp[21 * trans_size] |
358 | 0 | + g_ai2_ihevc_trans_32[23][k] |
359 | 0 | * pi2_tmp[23 * trans_size] |
360 | 0 | + g_ai2_ihevc_trans_32[25][k] |
361 | 0 | * pi2_tmp[25 * trans_size] |
362 | 0 | + g_ai2_ihevc_trans_32[27][k] |
363 | 0 | * pi2_tmp[27 * trans_size] |
364 | 0 | + g_ai2_ihevc_trans_32[29][k] |
365 | 0 | * pi2_tmp[29 * trans_size] |
366 | 0 | + g_ai2_ihevc_trans_32[31][k] |
367 | 0 | * pi2_tmp[31 * trans_size]; |
368 | 0 | } |
369 | 0 | for(k = 0; k < 8; k++) |
370 | 0 | { |
371 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
372 | 0 | + g_ai2_ihevc_trans_32[6][k] |
373 | 0 | * pi2_tmp[6 * trans_size] |
374 | 0 | + g_ai2_ihevc_trans_32[10][k] |
375 | 0 | * pi2_tmp[10 * trans_size] |
376 | 0 | + g_ai2_ihevc_trans_32[14][k] |
377 | 0 | * pi2_tmp[14 * trans_size] |
378 | 0 | + g_ai2_ihevc_trans_32[18][k] |
379 | 0 | * pi2_tmp[18 * trans_size] |
380 | 0 | + g_ai2_ihevc_trans_32[22][k] |
381 | 0 | * pi2_tmp[22 * trans_size] |
382 | 0 | + g_ai2_ihevc_trans_32[26][k] |
383 | 0 | * pi2_tmp[26 * trans_size] |
384 | 0 | + g_ai2_ihevc_trans_32[30][k] |
385 | 0 | * pi2_tmp[30 * trans_size]; |
386 | 0 | } |
387 | 0 | for(k = 0; k < 4; k++) |
388 | 0 | { |
389 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
390 | 0 | + g_ai2_ihevc_trans_32[12][k] |
391 | 0 | * pi2_tmp[12 * trans_size] |
392 | 0 | + g_ai2_ihevc_trans_32[20][k] |
393 | 0 | * pi2_tmp[20 * trans_size] |
394 | 0 | + g_ai2_ihevc_trans_32[28][k] |
395 | 0 | * pi2_tmp[28 * trans_size]; |
396 | 0 | } |
397 | 0 | eeeo[0] = |
398 | 0 | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
399 | 0 | + g_ai2_ihevc_trans_32[24][0] |
400 | 0 | * pi2_tmp[24 |
401 | 0 | * trans_size]; |
402 | 0 | eeeo[1] = |
403 | 0 | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
404 | 0 | + g_ai2_ihevc_trans_32[24][1] |
405 | 0 | * pi2_tmp[24 |
406 | 0 | * trans_size]; |
407 | 0 | eeee[0] = |
408 | 0 | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
409 | 0 | + g_ai2_ihevc_trans_32[16][0] |
410 | 0 | * pi2_tmp[16 |
411 | 0 | * trans_size]; |
412 | 0 | eeee[1] = |
413 | 0 | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
414 | 0 | + g_ai2_ihevc_trans_32[16][1] |
415 | 0 | * pi2_tmp[16 |
416 | 0 | * trans_size]; |
417 | | |
418 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
419 | 0 | eee[0] = eeee[0] + eeeo[0]; |
420 | 0 | eee[3] = eeee[0] - eeeo[0]; |
421 | 0 | eee[1] = eeee[1] + eeeo[1]; |
422 | 0 | eee[2] = eeee[1] - eeeo[1]; |
423 | 0 | for(k = 0; k < 4; k++) |
424 | 0 | { |
425 | 0 | ee[k] = eee[k] + eeo[k]; |
426 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
427 | 0 | } |
428 | 0 | for(k = 0; k < 8; k++) |
429 | 0 | { |
430 | 0 | e[k] = ee[k] + eo[k]; |
431 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
432 | 0 | } |
433 | 0 | for(k = 0; k < 16; k++) |
434 | 0 | { |
435 | 0 | WORD32 itrans_out; |
436 | 0 | itrans_out = |
437 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
438 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
439 | 0 | itrans_out = |
440 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
441 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
442 | 0 | } |
443 | 0 | pi2_tmp++; |
444 | 0 | pu1_pred += pred_strd; |
445 | 0 | pu1_dst += dst_strd; |
446 | 0 | } |
447 | 0 | } |
448 | | /************************************************************************************************/ |
449 | | /************************************END - IT_RECON_32x32****************************************/ |
450 | | /************************************************************************************************/ |
451 | 0 | } |
452 | 0 | else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */ |
453 | 0 | { |
454 | | /************************************************************************************************/ |
455 | | /**********************************START - IT_RECON_32x32****************************************/ |
456 | | /************************************************************************************************/ |
457 | | /* Inverse Transform 1st stage */ |
458 | 0 | shift = IT_SHIFT_STAGE_1; |
459 | 0 | add = 1 << (shift - 1); |
460 | |
|
461 | 0 | for(j = 0; j < row_limit_2nd_stage; j++) |
462 | 0 | { |
463 | | /* Checking for Zero Cols */ |
464 | 0 | if((zero_cols & 1) == 1) |
465 | 0 | { |
466 | 0 | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
467 | 0 | } |
468 | 0 | else |
469 | 0 | { |
470 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
471 | 0 | for(k = 0; k < 16; k++) |
472 | 0 | { |
473 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
474 | 0 | + g_ai2_ihevc_trans_32[3][k] |
475 | 0 | * pi2_src[3 * src_strd] |
476 | 0 | + g_ai2_ihevc_trans_32[5][k] |
477 | 0 | * pi2_src[5 * src_strd] |
478 | 0 | + g_ai2_ihevc_trans_32[7][k] |
479 | 0 | * pi2_src[7 * src_strd]; |
480 | 0 | } |
481 | 0 | for(k = 0; k < 8; k++) |
482 | 0 | { |
483 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
484 | 0 | + g_ai2_ihevc_trans_32[6][k] |
485 | 0 | * pi2_src[6 * src_strd]; |
486 | 0 | } |
487 | 0 | for(k = 0; k < 4; k++) |
488 | 0 | { |
489 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]; |
490 | 0 | } |
491 | 0 | eeeo[0] = 0; |
492 | 0 | eeeo[1] = 0; |
493 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]; |
494 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]; |
495 | | |
496 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
497 | 0 | eee[0] = eeee[0] + eeeo[0]; |
498 | 0 | eee[3] = eeee[0] - eeeo[0]; |
499 | 0 | eee[1] = eeee[1] + eeeo[1]; |
500 | 0 | eee[2] = eeee[1] - eeeo[1]; |
501 | 0 | for(k = 0; k < 4; k++) |
502 | 0 | { |
503 | 0 | ee[k] = eee[k] + eeo[k]; |
504 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
505 | 0 | } |
506 | 0 | for(k = 0; k < 8; k++) |
507 | 0 | { |
508 | 0 | e[k] = ee[k] + eo[k]; |
509 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
510 | 0 | } |
511 | 0 | for(k = 0; k < 16; k++) |
512 | 0 | { |
513 | 0 | pi2_tmp[k] = |
514 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
515 | 0 | pi2_tmp[k + 16] = |
516 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
517 | 0 | } |
518 | 0 | } |
519 | 0 | pi2_src++; |
520 | 0 | pi2_tmp += trans_size; |
521 | 0 | zero_cols = zero_cols >> 1; |
522 | 0 | } |
523 | |
|
524 | 0 | pi2_tmp = pi2_tmp_orig; |
525 | | |
526 | | /* Inverse Transform 2nd stage */ |
527 | 0 | shift = IT_SHIFT_STAGE_2; |
528 | 0 | add = 1 << (shift - 1); |
529 | 0 | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
530 | 0 | { |
531 | 0 | for(j = 0; j < trans_size; j++) |
532 | 0 | { |
533 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
534 | 0 | for(k = 0; k < 16; k++) |
535 | 0 | { |
536 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
537 | 0 | + g_ai2_ihevc_trans_32[3][k] |
538 | 0 | * pi2_tmp[3 * trans_size]; |
539 | 0 | } |
540 | 0 | for(k = 0; k < 8; k++) |
541 | 0 | { |
542 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
543 | 0 | } |
544 | | // for(k = 0; k < 4; k++) |
545 | 0 | { |
546 | 0 | eeo[0] = 0; |
547 | 0 | eeo[1] = 0; |
548 | 0 | eeo[2] = 0; |
549 | 0 | eeo[3] = 0; |
550 | 0 | } |
551 | 0 | eeeo[0] = 0; |
552 | 0 | eeeo[1] = 0; |
553 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
554 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
555 | | |
556 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
557 | 0 | eee[0] = eeee[0] + eeeo[0]; |
558 | 0 | eee[3] = eeee[0] - eeeo[0]; |
559 | 0 | eee[1] = eeee[1] + eeeo[1]; |
560 | 0 | eee[2] = eeee[1] - eeeo[1]; |
561 | 0 | for(k = 0; k < 4; k++) |
562 | 0 | { |
563 | 0 | ee[k] = eee[k] + eeo[k]; |
564 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
565 | 0 | } |
566 | 0 | for(k = 0; k < 8; k++) |
567 | 0 | { |
568 | 0 | e[k] = ee[k] + eo[k]; |
569 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
570 | 0 | } |
571 | 0 | for(k = 0; k < 16; k++) |
572 | 0 | { |
573 | 0 | WORD32 itrans_out; |
574 | 0 | itrans_out = |
575 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
576 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
577 | 0 | itrans_out = |
578 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
579 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
580 | 0 | } |
581 | 0 | pi2_tmp++; |
582 | 0 | pu1_pred += pred_strd; |
583 | 0 | pu1_dst += dst_strd; |
584 | 0 | } |
585 | 0 | } |
586 | 0 | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
587 | 0 | { |
588 | 0 | for(j = 0; j < trans_size; j++) |
589 | 0 | { |
590 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
591 | 0 | for(k = 0; k < 16; k++) |
592 | 0 | { |
593 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
594 | 0 | + g_ai2_ihevc_trans_32[3][k] |
595 | 0 | * pi2_tmp[3 * trans_size] |
596 | 0 | + g_ai2_ihevc_trans_32[5][k] |
597 | 0 | * pi2_tmp[5 * trans_size] |
598 | 0 | + g_ai2_ihevc_trans_32[7][k] |
599 | 0 | * pi2_tmp[7 * trans_size]; |
600 | 0 | } |
601 | 0 | for(k = 0; k < 8; k++) |
602 | 0 | { |
603 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
604 | 0 | + g_ai2_ihevc_trans_32[6][k] |
605 | 0 | * pi2_tmp[6 * trans_size]; |
606 | 0 | } |
607 | 0 | for(k = 0; k < 4; k++) |
608 | 0 | { |
609 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
610 | 0 | } |
611 | 0 | eeeo[0] = 0; |
612 | 0 | eeeo[1] = 0; |
613 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
614 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
615 | | |
616 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
617 | 0 | eee[0] = eeee[0] + eeeo[0]; |
618 | 0 | eee[3] = eeee[0] - eeeo[0]; |
619 | 0 | eee[1] = eeee[1] + eeeo[1]; |
620 | 0 | eee[2] = eeee[1] - eeeo[1]; |
621 | 0 | for(k = 0; k < 4; k++) |
622 | 0 | { |
623 | 0 | ee[k] = eee[k] + eeo[k]; |
624 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
625 | 0 | } |
626 | 0 | for(k = 0; k < 8; k++) |
627 | 0 | { |
628 | 0 | e[k] = ee[k] + eo[k]; |
629 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
630 | 0 | } |
631 | 0 | for(k = 0; k < 16; k++) |
632 | 0 | { |
633 | 0 | WORD32 itrans_out; |
634 | 0 | itrans_out = |
635 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
636 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
637 | 0 | itrans_out = |
638 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
639 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
640 | 0 | } |
641 | 0 | pi2_tmp++; |
642 | 0 | pu1_pred += pred_strd; |
643 | 0 | pu1_dst += dst_strd; |
644 | 0 | } |
645 | 0 | } |
646 | 0 | else /* All rows of output of 1st stage are non-zero */ |
647 | 0 | { |
648 | 0 | for(j = 0; j < trans_size; j++) |
649 | 0 | { |
650 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
651 | 0 | for(k = 0; k < 16; k++) |
652 | 0 | { |
653 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
654 | 0 | + g_ai2_ihevc_trans_32[3][k] |
655 | 0 | * pi2_tmp[3 * trans_size] |
656 | 0 | + g_ai2_ihevc_trans_32[5][k] |
657 | 0 | * pi2_tmp[5 * trans_size] |
658 | 0 | + g_ai2_ihevc_trans_32[7][k] |
659 | 0 | * pi2_tmp[7 * trans_size] |
660 | 0 | + g_ai2_ihevc_trans_32[9][k] |
661 | 0 | * pi2_tmp[9 * trans_size] |
662 | 0 | + g_ai2_ihevc_trans_32[11][k] |
663 | 0 | * pi2_tmp[11 * trans_size] |
664 | 0 | + g_ai2_ihevc_trans_32[13][k] |
665 | 0 | * pi2_tmp[13 * trans_size] |
666 | 0 | + g_ai2_ihevc_trans_32[15][k] |
667 | 0 | * pi2_tmp[15 * trans_size] |
668 | 0 | + g_ai2_ihevc_trans_32[17][k] |
669 | 0 | * pi2_tmp[17 * trans_size] |
670 | 0 | + g_ai2_ihevc_trans_32[19][k] |
671 | 0 | * pi2_tmp[19 * trans_size] |
672 | 0 | + g_ai2_ihevc_trans_32[21][k] |
673 | 0 | * pi2_tmp[21 * trans_size] |
674 | 0 | + g_ai2_ihevc_trans_32[23][k] |
675 | 0 | * pi2_tmp[23 * trans_size] |
676 | 0 | + g_ai2_ihevc_trans_32[25][k] |
677 | 0 | * pi2_tmp[25 * trans_size] |
678 | 0 | + g_ai2_ihevc_trans_32[27][k] |
679 | 0 | * pi2_tmp[27 * trans_size] |
680 | 0 | + g_ai2_ihevc_trans_32[29][k] |
681 | 0 | * pi2_tmp[29 * trans_size] |
682 | 0 | + g_ai2_ihevc_trans_32[31][k] |
683 | 0 | * pi2_tmp[31 * trans_size]; |
684 | 0 | } |
685 | 0 | for(k = 0; k < 8; k++) |
686 | 0 | { |
687 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
688 | 0 | + g_ai2_ihevc_trans_32[6][k] |
689 | 0 | * pi2_tmp[6 * trans_size] |
690 | 0 | + g_ai2_ihevc_trans_32[10][k] |
691 | 0 | * pi2_tmp[10 * trans_size] |
692 | 0 | + g_ai2_ihevc_trans_32[14][k] |
693 | 0 | * pi2_tmp[14 * trans_size] |
694 | 0 | + g_ai2_ihevc_trans_32[18][k] |
695 | 0 | * pi2_tmp[18 * trans_size] |
696 | 0 | + g_ai2_ihevc_trans_32[22][k] |
697 | 0 | * pi2_tmp[22 * trans_size] |
698 | 0 | + g_ai2_ihevc_trans_32[26][k] |
699 | 0 | * pi2_tmp[26 * trans_size] |
700 | 0 | + g_ai2_ihevc_trans_32[30][k] |
701 | 0 | * pi2_tmp[30 * trans_size]; |
702 | 0 | } |
703 | 0 | for(k = 0; k < 4; k++) |
704 | 0 | { |
705 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
706 | 0 | + g_ai2_ihevc_trans_32[12][k] |
707 | 0 | * pi2_tmp[12 * trans_size] |
708 | 0 | + g_ai2_ihevc_trans_32[20][k] |
709 | 0 | * pi2_tmp[20 * trans_size] |
710 | 0 | + g_ai2_ihevc_trans_32[28][k] |
711 | 0 | * pi2_tmp[28 * trans_size]; |
712 | 0 | } |
713 | 0 | eeeo[0] = |
714 | 0 | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
715 | 0 | + g_ai2_ihevc_trans_32[24][0] |
716 | 0 | * pi2_tmp[24 |
717 | 0 | * trans_size]; |
718 | 0 | eeeo[1] = |
719 | 0 | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
720 | 0 | + g_ai2_ihevc_trans_32[24][1] |
721 | 0 | * pi2_tmp[24 |
722 | 0 | * trans_size]; |
723 | 0 | eeee[0] = |
724 | 0 | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
725 | 0 | + g_ai2_ihevc_trans_32[16][0] |
726 | 0 | * pi2_tmp[16 |
727 | 0 | * trans_size]; |
728 | 0 | eeee[1] = |
729 | 0 | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
730 | 0 | + g_ai2_ihevc_trans_32[16][1] |
731 | 0 | * pi2_tmp[16 |
732 | 0 | * trans_size]; |
733 | | |
734 | | /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */ |
735 | 0 | eee[0] = eeee[0] + eeeo[0]; |
736 | 0 | eee[3] = eeee[0] - eeeo[0]; |
737 | 0 | eee[1] = eeee[1] + eeeo[1]; |
738 | 0 | eee[2] = eeee[1] - eeeo[1]; |
739 | 0 | for(k = 0; k < 4; k++) |
740 | 0 | { |
741 | 0 | ee[k] = eee[k] + eeo[k]; |
742 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
743 | 0 | } |
744 | 0 | for(k = 0; k < 8; k++) |
745 | 0 | { |
746 | 0 | e[k] = ee[k] + eo[k]; |
747 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
748 | 0 | } |
749 | 0 | for(k = 0; k < 16; k++) |
750 | 0 | { |
751 | 0 | WORD32 itrans_out; |
752 | 0 | itrans_out = |
753 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
754 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
755 | 0 | itrans_out = |
756 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
757 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
758 | 0 | } |
759 | 0 | pi2_tmp++; |
760 | 0 | pu1_pred += pred_strd; |
761 | 0 | pu1_dst += dst_strd; |
762 | 0 | } |
763 | 0 | } |
764 | | /************************************************************************************************/ |
765 | | /************************************END - IT_RECON_32x32****************************************/ |
766 | | /************************************************************************************************/ |
767 | 0 | } |
768 | 0 | else /* All rows of input are non-zero */ |
769 | 0 | { |
770 | | /************************************************************************************************/ |
771 | | /**********************************START - IT_RECON_32x32****************************************/ |
772 | | /************************************************************************************************/ |
773 | | /* Inverse Transform 1st stage */ |
774 | 0 | shift = IT_SHIFT_STAGE_1; |
775 | 0 | add = 1 << (shift - 1); |
776 | |
|
777 | 0 | for(j = 0; j < row_limit_2nd_stage; j++) |
778 | 0 | { |
779 | | /* Checking for Zero Cols */ |
780 | 0 | if((zero_cols & 1) == 1) |
781 | 0 | { |
782 | 0 | memset(pi2_tmp, 0, trans_size * sizeof(WORD16)); |
783 | 0 | } |
784 | 0 | else |
785 | 0 | { |
786 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
787 | 0 | for(k = 0; k < 16; k++) |
788 | 0 | { |
789 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd] |
790 | 0 | + g_ai2_ihevc_trans_32[3][k] |
791 | 0 | * pi2_src[3 * src_strd] |
792 | 0 | + g_ai2_ihevc_trans_32[5][k] |
793 | 0 | * pi2_src[5 * src_strd] |
794 | 0 | + g_ai2_ihevc_trans_32[7][k] |
795 | 0 | * pi2_src[7 * src_strd] |
796 | 0 | + g_ai2_ihevc_trans_32[9][k] |
797 | 0 | * pi2_src[9 * src_strd] |
798 | 0 | + g_ai2_ihevc_trans_32[11][k] |
799 | 0 | * pi2_src[11 * src_strd] |
800 | 0 | + g_ai2_ihevc_trans_32[13][k] |
801 | 0 | * pi2_src[13 * src_strd] |
802 | 0 | + g_ai2_ihevc_trans_32[15][k] |
803 | 0 | * pi2_src[15 * src_strd] |
804 | 0 | + g_ai2_ihevc_trans_32[17][k] |
805 | 0 | * pi2_src[17 * src_strd] |
806 | 0 | + g_ai2_ihevc_trans_32[19][k] |
807 | 0 | * pi2_src[19 * src_strd] |
808 | 0 | + g_ai2_ihevc_trans_32[21][k] |
809 | 0 | * pi2_src[21 * src_strd] |
810 | 0 | + g_ai2_ihevc_trans_32[23][k] |
811 | 0 | * pi2_src[23 * src_strd] |
812 | 0 | + g_ai2_ihevc_trans_32[25][k] |
813 | 0 | * pi2_src[25 * src_strd] |
814 | 0 | + g_ai2_ihevc_trans_32[27][k] |
815 | 0 | * pi2_src[27 * src_strd] |
816 | 0 | + g_ai2_ihevc_trans_32[29][k] |
817 | 0 | * pi2_src[29 * src_strd] |
818 | 0 | + g_ai2_ihevc_trans_32[31][k] |
819 | 0 | * pi2_src[31 * src_strd]; |
820 | 0 | } |
821 | 0 | for(k = 0; k < 8; k++) |
822 | 0 | { |
823 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd] |
824 | 0 | + g_ai2_ihevc_trans_32[6][k] |
825 | 0 | * pi2_src[6 * src_strd] |
826 | 0 | + g_ai2_ihevc_trans_32[10][k] |
827 | 0 | * pi2_src[10 * src_strd] |
828 | 0 | + g_ai2_ihevc_trans_32[14][k] |
829 | 0 | * pi2_src[14 * src_strd] |
830 | 0 | + g_ai2_ihevc_trans_32[18][k] |
831 | 0 | * pi2_src[18 * src_strd] |
832 | 0 | + g_ai2_ihevc_trans_32[22][k] |
833 | 0 | * pi2_src[22 * src_strd] |
834 | 0 | + g_ai2_ihevc_trans_32[26][k] |
835 | 0 | * pi2_src[26 * src_strd] |
836 | 0 | + g_ai2_ihevc_trans_32[30][k] |
837 | 0 | * pi2_src[30 * src_strd]; |
838 | 0 | } |
839 | 0 | for(k = 0; k < 4; k++) |
840 | 0 | { |
841 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd] |
842 | 0 | + g_ai2_ihevc_trans_32[12][k] |
843 | 0 | * pi2_src[12 * src_strd] |
844 | 0 | + g_ai2_ihevc_trans_32[20][k] |
845 | 0 | * pi2_src[20 * src_strd] |
846 | 0 | + g_ai2_ihevc_trans_32[28][k] |
847 | 0 | * pi2_src[28 * src_strd]; |
848 | 0 | } |
849 | 0 | eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd] |
850 | 0 | + g_ai2_ihevc_trans_32[24][0] |
851 | 0 | * pi2_src[24 * src_strd]; |
852 | 0 | eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd] |
853 | 0 | + g_ai2_ihevc_trans_32[24][1] |
854 | 0 | * pi2_src[24 * src_strd]; |
855 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0] |
856 | 0 | + g_ai2_ihevc_trans_32[16][0] |
857 | 0 | * pi2_src[16 * src_strd]; |
858 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0] |
859 | 0 | + g_ai2_ihevc_trans_32[16][1] |
860 | 0 | * pi2_src[16 * src_strd]; |
861 | | |
862 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
863 | 0 | eee[0] = eeee[0] + eeeo[0]; |
864 | 0 | eee[3] = eeee[0] - eeeo[0]; |
865 | 0 | eee[1] = eeee[1] + eeeo[1]; |
866 | 0 | eee[2] = eeee[1] - eeeo[1]; |
867 | 0 | for(k = 0; k < 4; k++) |
868 | 0 | { |
869 | 0 | ee[k] = eee[k] + eeo[k]; |
870 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
871 | 0 | } |
872 | 0 | for(k = 0; k < 8; k++) |
873 | 0 | { |
874 | 0 | e[k] = ee[k] + eo[k]; |
875 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
876 | 0 | } |
877 | 0 | for(k = 0; k < 16; k++) |
878 | 0 | { |
879 | 0 | pi2_tmp[k] = |
880 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
881 | 0 | pi2_tmp[k + 16] = |
882 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
883 | 0 | } |
884 | 0 | } |
885 | 0 | pi2_src++; |
886 | 0 | pi2_tmp += trans_size; |
887 | 0 | zero_cols = zero_cols >> 1; |
888 | 0 | } |
889 | |
|
890 | 0 | pi2_tmp = pi2_tmp_orig; |
891 | | |
892 | | /* Inverse Transform 2nd stage */ |
893 | 0 | shift = IT_SHIFT_STAGE_2; |
894 | 0 | add = 1 << (shift - 1); |
895 | 0 | if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */ |
896 | 0 | { |
897 | 0 | for(j = 0; j < trans_size; j++) |
898 | 0 | { |
899 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
900 | 0 | for(k = 0; k < 16; k++) |
901 | 0 | { |
902 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
903 | 0 | + g_ai2_ihevc_trans_32[3][k] |
904 | 0 | * pi2_tmp[3 * trans_size]; |
905 | 0 | } |
906 | 0 | for(k = 0; k < 8; k++) |
907 | 0 | { |
908 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]; |
909 | 0 | } |
910 | | // for(k = 0; k < 4; k++) |
911 | 0 | { |
912 | 0 | eeo[0] = 0; |
913 | 0 | eeo[1] = 0; |
914 | 0 | eeo[2] = 0; |
915 | 0 | eeo[3] = 0; |
916 | 0 | } |
917 | 0 | eeeo[0] = 0; |
918 | 0 | eeeo[1] = 0; |
919 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
920 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
921 | | |
922 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
923 | 0 | eee[0] = eeee[0] + eeeo[0]; |
924 | 0 | eee[3] = eeee[0] - eeeo[0]; |
925 | 0 | eee[1] = eeee[1] + eeeo[1]; |
926 | 0 | eee[2] = eeee[1] - eeeo[1]; |
927 | 0 | for(k = 0; k < 4; k++) |
928 | 0 | { |
929 | 0 | ee[k] = eee[k] + eeo[k]; |
930 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
931 | 0 | } |
932 | 0 | for(k = 0; k < 8; k++) |
933 | 0 | { |
934 | 0 | e[k] = ee[k] + eo[k]; |
935 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
936 | 0 | } |
937 | 0 | for(k = 0; k < 16; k++) |
938 | 0 | { |
939 | 0 | WORD32 itrans_out; |
940 | 0 | itrans_out = |
941 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
942 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
943 | 0 | itrans_out = |
944 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
945 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
946 | 0 | } |
947 | 0 | pi2_tmp++; |
948 | 0 | pu1_pred += pred_strd; |
949 | 0 | pu1_dst += dst_strd; |
950 | 0 | } |
951 | 0 | } |
952 | 0 | else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */ |
953 | 0 | { |
954 | 0 | for(j = 0; j < trans_size; j++) |
955 | 0 | { |
956 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
957 | 0 | for(k = 0; k < 16; k++) |
958 | 0 | { |
959 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
960 | 0 | + g_ai2_ihevc_trans_32[3][k] |
961 | 0 | * pi2_tmp[3 * trans_size] |
962 | 0 | + g_ai2_ihevc_trans_32[5][k] |
963 | 0 | * pi2_tmp[5 * trans_size] |
964 | 0 | + g_ai2_ihevc_trans_32[7][k] |
965 | 0 | * pi2_tmp[7 * trans_size]; |
966 | 0 | } |
967 | 0 | for(k = 0; k < 8; k++) |
968 | 0 | { |
969 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
970 | 0 | + g_ai2_ihevc_trans_32[6][k] |
971 | 0 | * pi2_tmp[6 * trans_size]; |
972 | 0 | } |
973 | 0 | for(k = 0; k < 4; k++) |
974 | 0 | { |
975 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]; |
976 | 0 | } |
977 | 0 | eeeo[0] = 0; |
978 | 0 | eeeo[1] = 0; |
979 | 0 | eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]; |
980 | 0 | eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]; |
981 | | |
982 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
983 | 0 | eee[0] = eeee[0] + eeeo[0]; |
984 | 0 | eee[3] = eeee[0] - eeeo[0]; |
985 | 0 | eee[1] = eeee[1] + eeeo[1]; |
986 | 0 | eee[2] = eeee[1] - eeeo[1]; |
987 | 0 | for(k = 0; k < 4; k++) |
988 | 0 | { |
989 | 0 | ee[k] = eee[k] + eeo[k]; |
990 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
991 | 0 | } |
992 | 0 | for(k = 0; k < 8; k++) |
993 | 0 | { |
994 | 0 | e[k] = ee[k] + eo[k]; |
995 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
996 | 0 | } |
997 | 0 | for(k = 0; k < 16; k++) |
998 | 0 | { |
999 | 0 | WORD32 itrans_out; |
1000 | 0 | itrans_out = |
1001 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
1002 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
1003 | 0 | itrans_out = |
1004 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
1005 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
1006 | 0 | } |
1007 | 0 | pi2_tmp++; |
1008 | 0 | pu1_pred += pred_strd; |
1009 | 0 | pu1_dst += dst_strd; |
1010 | 0 | } |
1011 | 0 | } |
1012 | 0 | else /* All rows of output of 1st stage are non-zero */ |
1013 | 0 | { |
1014 | 0 | for(j = 0; j < trans_size; j++) |
1015 | 0 | { |
1016 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
1017 | 0 | for(k = 0; k < 16; k++) |
1018 | 0 | { |
1019 | 0 | o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size] |
1020 | 0 | + g_ai2_ihevc_trans_32[3][k] |
1021 | 0 | * pi2_tmp[3 * trans_size] |
1022 | 0 | + g_ai2_ihevc_trans_32[5][k] |
1023 | 0 | * pi2_tmp[5 * trans_size] |
1024 | 0 | + g_ai2_ihevc_trans_32[7][k] |
1025 | 0 | * pi2_tmp[7 * trans_size] |
1026 | 0 | + g_ai2_ihevc_trans_32[9][k] |
1027 | 0 | * pi2_tmp[9 * trans_size] |
1028 | 0 | + g_ai2_ihevc_trans_32[11][k] |
1029 | 0 | * pi2_tmp[11 * trans_size] |
1030 | 0 | + g_ai2_ihevc_trans_32[13][k] |
1031 | 0 | * pi2_tmp[13 * trans_size] |
1032 | 0 | + g_ai2_ihevc_trans_32[15][k] |
1033 | 0 | * pi2_tmp[15 * trans_size] |
1034 | 0 | + g_ai2_ihevc_trans_32[17][k] |
1035 | 0 | * pi2_tmp[17 * trans_size] |
1036 | 0 | + g_ai2_ihevc_trans_32[19][k] |
1037 | 0 | * pi2_tmp[19 * trans_size] |
1038 | 0 | + g_ai2_ihevc_trans_32[21][k] |
1039 | 0 | * pi2_tmp[21 * trans_size] |
1040 | 0 | + g_ai2_ihevc_trans_32[23][k] |
1041 | 0 | * pi2_tmp[23 * trans_size] |
1042 | 0 | + g_ai2_ihevc_trans_32[25][k] |
1043 | 0 | * pi2_tmp[25 * trans_size] |
1044 | 0 | + g_ai2_ihevc_trans_32[27][k] |
1045 | 0 | * pi2_tmp[27 * trans_size] |
1046 | 0 | + g_ai2_ihevc_trans_32[29][k] |
1047 | 0 | * pi2_tmp[29 * trans_size] |
1048 | 0 | + g_ai2_ihevc_trans_32[31][k] |
1049 | 0 | * pi2_tmp[31 * trans_size]; |
1050 | 0 | } |
1051 | 0 | for(k = 0; k < 8; k++) |
1052 | 0 | { |
1053 | 0 | eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size] |
1054 | 0 | + g_ai2_ihevc_trans_32[6][k] |
1055 | 0 | * pi2_tmp[6 * trans_size] |
1056 | 0 | + g_ai2_ihevc_trans_32[10][k] |
1057 | 0 | * pi2_tmp[10 * trans_size] |
1058 | 0 | + g_ai2_ihevc_trans_32[14][k] |
1059 | 0 | * pi2_tmp[14 * trans_size] |
1060 | 0 | + g_ai2_ihevc_trans_32[18][k] |
1061 | 0 | * pi2_tmp[18 * trans_size] |
1062 | 0 | + g_ai2_ihevc_trans_32[22][k] |
1063 | 0 | * pi2_tmp[22 * trans_size] |
1064 | 0 | + g_ai2_ihevc_trans_32[26][k] |
1065 | 0 | * pi2_tmp[26 * trans_size] |
1066 | 0 | + g_ai2_ihevc_trans_32[30][k] |
1067 | 0 | * pi2_tmp[30 * trans_size]; |
1068 | 0 | } |
1069 | 0 | for(k = 0; k < 4; k++) |
1070 | 0 | { |
1071 | 0 | eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size] |
1072 | 0 | + g_ai2_ihevc_trans_32[12][k] |
1073 | 0 | * pi2_tmp[12 * trans_size] |
1074 | 0 | + g_ai2_ihevc_trans_32[20][k] |
1075 | 0 | * pi2_tmp[20 * trans_size] |
1076 | 0 | + g_ai2_ihevc_trans_32[28][k] |
1077 | 0 | * pi2_tmp[28 * trans_size]; |
1078 | 0 | } |
1079 | 0 | eeeo[0] = |
1080 | 0 | g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size] |
1081 | 0 | + g_ai2_ihevc_trans_32[24][0] |
1082 | 0 | * pi2_tmp[24 |
1083 | 0 | * trans_size]; |
1084 | 0 | eeeo[1] = |
1085 | 0 | g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size] |
1086 | 0 | + g_ai2_ihevc_trans_32[24][1] |
1087 | 0 | * pi2_tmp[24 |
1088 | 0 | * trans_size]; |
1089 | 0 | eeee[0] = |
1090 | 0 | g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0] |
1091 | 0 | + g_ai2_ihevc_trans_32[16][0] |
1092 | 0 | * pi2_tmp[16 |
1093 | 0 | * trans_size]; |
1094 | 0 | eeee[1] = |
1095 | 0 | g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0] |
1096 | 0 | + g_ai2_ihevc_trans_32[16][1] |
1097 | 0 | * pi2_tmp[16 |
1098 | 0 | * trans_size]; |
1099 | | |
1100 | | /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */ |
1101 | 0 | eee[0] = eeee[0] + eeeo[0]; |
1102 | 0 | eee[3] = eeee[0] - eeeo[0]; |
1103 | 0 | eee[1] = eeee[1] + eeeo[1]; |
1104 | 0 | eee[2] = eeee[1] - eeeo[1]; |
1105 | 0 | for(k = 0; k < 4; k++) |
1106 | 0 | { |
1107 | 0 | ee[k] = eee[k] + eeo[k]; |
1108 | 0 | ee[k + 4] = eee[3 - k] - eeo[3 - k]; |
1109 | 0 | } |
1110 | 0 | for(k = 0; k < 8; k++) |
1111 | 0 | { |
1112 | 0 | e[k] = ee[k] + eo[k]; |
1113 | 0 | e[k + 8] = ee[7 - k] - eo[7 - k]; |
1114 | 0 | } |
1115 | 0 | for(k = 0; k < 16; k++) |
1116 | 0 | { |
1117 | 0 | WORD32 itrans_out; |
1118 | 0 | itrans_out = |
1119 | 0 | CLIP_S16(((e[k] + o[k] + add) >> shift)); |
1120 | 0 | pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2])); |
1121 | 0 | itrans_out = |
1122 | 0 | CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift)); |
1123 | 0 | pu1_dst[(k + 16) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 16) * 2])); |
1124 | 0 | } |
1125 | 0 | pi2_tmp++; |
1126 | 0 | pu1_pred += pred_strd; |
1127 | 0 | pu1_dst += dst_strd; |
1128 | 0 | } |
1129 | 0 | } |
1130 | | /************************************************************************************************/ |
1131 | | /************************************END - IT_RECON_32x32****************************************/ |
1132 | | /************************************************************************************************/ |
1133 | 0 | } |
1134 | 0 | } |
1135 | | |