Coverage Report

Created: 2025-07-12 07:16

/src/libhevc/common/ihevc_itrans_recon_32x32.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
 *******************************************************************************
20
 * @file
21
 *  ihevc_itrans_recon_32x32.c
22
 *
23
 * @brief
24
 *  Contains function definitions for inverse transform  and reconstruction 32x32
25
 *
26
 *
27
 * @author
28
 *  100470
29
 *
30
 * @par List of Functions:
31
 *  - ihevc_itrans_recon_32x32()
32
 *
33
 * @remarks
34
 *  None
35
 *
36
 *******************************************************************************
37
 */
38
#include <stdio.h>
39
#include <string.h>
40
#include "ihevc_typedefs.h"
41
#include "ihevc_macros.h"
42
#include "ihevc_platform_macros.h"
43
#include "ihevc_defs.h"
44
#include "ihevc_trans_tables.h"
45
#include "ihevc_itrans_recon.h"
46
#include "ihevc_func_selector.h"
47
#include "ihevc_trans_macros.h"
48
49
50
/**
51
 *******************************************************************************
52
 *
53
 * @brief
54
 *  This function performs Inverse transform  and reconstruction for 32x32
55
 * input block
56
 *
57
 * @par Description:
58
 *  Performs inverse transform and adds the prediction  data and clips output
59
 * to 8 bit
60
 *
61
 * @param[in] pi2_src
62
 *  Input 32x32 coefficients
63
 *
64
 * @param[in] pi2_tmp
65
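 *  Temporary 32x32 buffer for storing the inverse transform
66
 *  1st stage output
67
 *  NOTE(review): the 1st stage writes into this buffer, so it acts as
68
 *  scratch storage rather than a pure input - confirm the [in] tag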
69
 *
70
 * @param[in] pu1_pred
71
 *  Prediction 32x32 block
72
 *
73
 * @param[out] pu1_dst
74
 *  Output 32x32 block
75
 *
76
 * @param[in] src_strd
77
 *  Input stride
78
 *
79
 * @param[in] pred_strd
80
 *  Prediction stride
81
 *
82
 * @param[in] dst_strd
83
 *  Output Stride
84
 *
85
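 * @param[in] zero_cols
86
 *  Bitmask of zero columns in pi2_src (bit j set => column j is skipped
87
 *  and its 1st stage output is zero-filled)
88
 * @param[in] zero_rows
89
 *  Bitmask of zero rows in pi2_src (bit i set => row i is treated as zero)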
90
 *
91
 * @returns  Void
92
 *
93
 * @remarks
94
 *  None
95
 *
96
 *******************************************************************************
97
 */
98
99
void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
100
                              WORD16 *pi2_tmp,
101
                              UWORD8 *pu1_pred,
102
                              UWORD8 *pu1_dst,
103
                              WORD32 src_strd,
104
                              WORD32 pred_strd,
105
                              WORD32 dst_strd,
106
                              WORD32 zero_cols,
107
                              WORD32 zero_rows)
108
340k
{
109
340k
    WORD32 j, k;
110
340k
    WORD32 e[16], o[16];
111
340k
    WORD32 ee[8], eo[8];
112
340k
    WORD32 eee[4], eeo[4];
113
340k
    WORD32 eeee[2], eeeo[2];
114
340k
    WORD32 add;
115
340k
    WORD32 shift;
116
340k
    WORD16 *pi2_tmp_orig;
117
340k
    WORD32 trans_size;
118
340k
    WORD32 zero_rows_2nd_stage = zero_cols;
119
340k
    WORD32 row_limit_2nd_stage;
120
121
340k
    trans_size = TRANS_SIZE_32;
122
340k
    pi2_tmp_orig = pi2_tmp;
123
124
340k
    if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
125
81.6k
        row_limit_2nd_stage = 4;
126
258k
    else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
127
47.7k
        row_limit_2nd_stage = 8;
128
211k
    else
129
211k
        row_limit_2nd_stage = TRANS_SIZE_32;
130
131
340k
    if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0)  /* First 4 rows of input are non-zero */
132
68.5k
    {
133
        /************************************************************************************************/
134
        /**********************************START - IT_RECON_32x32****************************************/
135
        /************************************************************************************************/
136
        /* Inverse Transform 1st stage */
137
68.5k
        shift = IT_SHIFT_STAGE_1;
138
68.5k
        add = 1 << (shift - 1);
139
140
1.05M
        for(j = 0; j < row_limit_2nd_stage; j++)
141
984k
        {
142
            /* Checking for Zero Cols */
143
984k
            if((zero_cols & 1) == 1)
144
513k
            {
145
513k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
146
513k
            }
147
470k
            else
148
470k
            {
149
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
150
7.99M
                for(k = 0; k < 16; k++)
151
7.52M
                {
152
7.52M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
153
7.52M
                                    + g_ai2_ihevc_trans_32[3][k]
154
7.52M
                                                    * pi2_src[3 * src_strd];
155
7.52M
                }
156
4.23M
                for(k = 0; k < 8; k++)
157
3.76M
                {
158
3.76M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
159
3.76M
                }
160
//                for(k = 0; k < 4; k++)
161
470k
                {
162
470k
                    eeo[0] = 0;
163
470k
                    eeo[1] = 0;
164
470k
                    eeo[2] = 0;
165
470k
                    eeo[3] = 0;
166
470k
                }
167
470k
                eeeo[0] = 0;
168
470k
                eeeo[1] = 0;
169
470k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
170
470k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
171
172
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
173
470k
                eee[0] = eeee[0] + eeeo[0];
174
470k
                eee[3] = eeee[0] - eeeo[0];
175
470k
                eee[1] = eeee[1] + eeeo[1];
176
470k
                eee[2] = eeee[1] - eeeo[1];
177
2.35M
                for(k = 0; k < 4; k++)
178
1.88M
                {
179
1.88M
                    ee[k] = eee[k] + eeo[k];
180
1.88M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
181
1.88M
                }
182
4.23M
                for(k = 0; k < 8; k++)
183
3.76M
                {
184
3.76M
                    e[k] = ee[k] + eo[k];
185
3.76M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
186
3.76M
                }
187
7.99M
                for(k = 0; k < 16; k++)
188
7.52M
                {
189
7.52M
                    pi2_tmp[k] =
190
7.52M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
191
7.52M
                    pi2_tmp[k + 16] =
192
7.52M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
193
7.52M
                }
194
470k
            }
195
984k
            pi2_src++;
196
984k
            pi2_tmp += trans_size;
197
984k
            zero_cols = zero_cols >> 1;
198
984k
        }
199
200
68.5k
        pi2_tmp = pi2_tmp_orig;
201
202
        /* Inverse Transform 2nd stage */
203
68.5k
        shift = IT_SHIFT_STAGE_2;
204
68.5k
        add = 1 << (shift - 1);
205
68.5k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
206
32.0k
        {
207
1.05M
            for(j = 0; j < trans_size; j++)
208
1.02M
            {
209
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
210
17.4M
                for(k = 0; k < 16; k++)
211
16.3M
                {
212
16.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
213
16.3M
                                    + g_ai2_ihevc_trans_32[3][k]
214
16.3M
                                                    * pi2_tmp[3 * trans_size];
215
16.3M
                }
216
9.21M
                for(k = 0; k < 8; k++)
217
8.19M
                {
218
8.19M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
219
8.19M
                }
220
//                for(k = 0; k < 4; k++)
221
1.02M
                {
222
1.02M
                    eeo[0] = 0;
223
1.02M
                    eeo[1] = 0;
224
1.02M
                    eeo[2] = 0;
225
1.02M
                    eeo[3] = 0;
226
1.02M
                }
227
1.02M
                eeeo[0] = 0;
228
1.02M
                eeeo[1] = 0;
229
1.02M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
230
1.02M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
231
232
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
233
1.02M
                eee[0] = eeee[0] + eeeo[0];
234
1.02M
                eee[3] = eeee[0] - eeeo[0];
235
1.02M
                eee[1] = eeee[1] + eeeo[1];
236
1.02M
                eee[2] = eeee[1] - eeeo[1];
237
5.10M
                for(k = 0; k < 4; k++)
238
4.07M
                {
239
4.07M
                    ee[k] = eee[k] + eeo[k];
240
4.07M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
241
4.07M
                }
242
9.18M
                for(k = 0; k < 8; k++)
243
8.16M
                {
244
8.16M
                    e[k] = ee[k] + eo[k];
245
8.16M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
246
8.16M
                }
247
17.2M
                for(k = 0; k < 16; k++)
248
16.2M
                {
249
16.2M
                    WORD32 itrans_out;
250
16.2M
                    itrans_out =
251
16.2M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
252
16.2M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
253
16.2M
                    itrans_out =
254
16.2M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
255
16.2M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
256
16.2M
                }
257
1.02M
                pi2_tmp++;
258
1.02M
                pu1_pred += pred_strd;
259
1.02M
                pu1_dst += dst_strd;
260
1.02M
            }
261
32.0k
        }
262
36.5k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
263
12.9k
        {
264
428k
            for(j = 0; j < trans_size; j++)
265
415k
            {
266
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
267
7.06M
                for(k = 0; k < 16; k++)
268
6.65M
                {
269
6.65M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
270
6.65M
                                    + g_ai2_ihevc_trans_32[3][k]
271
6.65M
                                                    * pi2_tmp[3 * trans_size]
272
6.65M
                                    + g_ai2_ihevc_trans_32[5][k]
273
6.65M
                                                    * pi2_tmp[5 * trans_size]
274
6.65M
                                    + g_ai2_ihevc_trans_32[7][k]
275
6.65M
                                                    * pi2_tmp[7 * trans_size];
276
6.65M
                }
277
3.74M
                for(k = 0; k < 8; k++)
278
3.32M
                {
279
3.32M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
280
3.32M
                                    + g_ai2_ihevc_trans_32[6][k]
281
3.32M
                                                    * pi2_tmp[6 * trans_size];
282
3.32M
                }
283
2.07M
                for(k = 0; k < 4; k++)
284
1.66M
                {
285
1.66M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
286
1.66M
                }
287
415k
                eeeo[0] = 0;
288
415k
                eeeo[1] = 0;
289
415k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
290
415k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
291
292
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
293
415k
                eee[0] = eeee[0] + eeeo[0];
294
415k
                eee[3] = eeee[0] - eeeo[0];
295
415k
                eee[1] = eeee[1] + eeeo[1];
296
415k
                eee[2] = eeee[1] - eeeo[1];
297
2.07M
                for(k = 0; k < 4; k++)
298
1.66M
                {
299
1.66M
                    ee[k] = eee[k] + eeo[k];
300
1.66M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
301
1.66M
                }
302
3.74M
                for(k = 0; k < 8; k++)
303
3.32M
                {
304
3.32M
                    e[k] = ee[k] + eo[k];
305
3.32M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
306
3.32M
                }
307
7.06M
                for(k = 0; k < 16; k++)
308
6.64M
                {
309
6.64M
                    WORD32 itrans_out;
310
6.64M
                    itrans_out =
311
6.64M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
312
6.64M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
313
6.64M
                    itrans_out =
314
6.64M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
315
6.64M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
316
6.64M
                }
317
415k
                pi2_tmp++;
318
415k
                pu1_pred += pred_strd;
319
415k
                pu1_dst += dst_strd;
320
415k
            }
321
12.9k
        }
322
23.5k
        else /* All rows of output of 1st stage are non-zero */
323
23.5k
        {
324
775k
            for(j = 0; j < trans_size; j++)
325
752k
            {
326
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
327
12.5M
                for(k = 0; k < 16; k++)
328
11.8M
                {
329
11.8M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
330
11.8M
                                    + g_ai2_ihevc_trans_32[3][k]
331
11.8M
                                                    * pi2_tmp[3 * trans_size]
332
11.8M
                                    + g_ai2_ihevc_trans_32[5][k]
333
11.8M
                                                    * pi2_tmp[5 * trans_size]
334
11.8M
                                    + g_ai2_ihevc_trans_32[7][k]
335
11.8M
                                                    * pi2_tmp[7 * trans_size]
336
11.8M
                                    + g_ai2_ihevc_trans_32[9][k]
337
11.8M
                                                    * pi2_tmp[9 * trans_size]
338
11.8M
                                    + g_ai2_ihevc_trans_32[11][k]
339
11.8M
                                                    * pi2_tmp[11 * trans_size]
340
11.8M
                                    + g_ai2_ihevc_trans_32[13][k]
341
11.8M
                                                    * pi2_tmp[13 * trans_size]
342
11.8M
                                    + g_ai2_ihevc_trans_32[15][k]
343
11.8M
                                                    * pi2_tmp[15 * trans_size]
344
11.8M
                                    + g_ai2_ihevc_trans_32[17][k]
345
11.8M
                                                    * pi2_tmp[17 * trans_size]
346
11.8M
                                    + g_ai2_ihevc_trans_32[19][k]
347
11.8M
                                                    * pi2_tmp[19 * trans_size]
348
11.8M
                                    + g_ai2_ihevc_trans_32[21][k]
349
11.8M
                                                    * pi2_tmp[21 * trans_size]
350
11.8M
                                    + g_ai2_ihevc_trans_32[23][k]
351
11.8M
                                                    * pi2_tmp[23 * trans_size]
352
11.8M
                                    + g_ai2_ihevc_trans_32[25][k]
353
11.8M
                                                    * pi2_tmp[25 * trans_size]
354
11.8M
                                    + g_ai2_ihevc_trans_32[27][k]
355
11.8M
                                                    * pi2_tmp[27 * trans_size]
356
11.8M
                                    + g_ai2_ihevc_trans_32[29][k]
357
11.8M
                                                    * pi2_tmp[29 * trans_size]
358
11.8M
                                    + g_ai2_ihevc_trans_32[31][k]
359
11.8M
                                                    * pi2_tmp[31 * trans_size];
360
11.8M
                }
361
6.71M
                for(k = 0; k < 8; k++)
362
5.96M
                {
363
5.96M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
364
5.96M
                                    + g_ai2_ihevc_trans_32[6][k]
365
5.96M
                                                    * pi2_tmp[6 * trans_size]
366
5.96M
                                    + g_ai2_ihevc_trans_32[10][k]
367
5.96M
                                                    * pi2_tmp[10 * trans_size]
368
5.96M
                                    + g_ai2_ihevc_trans_32[14][k]
369
5.96M
                                                    * pi2_tmp[14 * trans_size]
370
5.96M
                                    + g_ai2_ihevc_trans_32[18][k]
371
5.96M
                                                    * pi2_tmp[18 * trans_size]
372
5.96M
                                    + g_ai2_ihevc_trans_32[22][k]
373
5.96M
                                                    * pi2_tmp[22 * trans_size]
374
5.96M
                                    + g_ai2_ihevc_trans_32[26][k]
375
5.96M
                                                    * pi2_tmp[26 * trans_size]
376
5.96M
                                    + g_ai2_ihevc_trans_32[30][k]
377
5.96M
                                                    * pi2_tmp[30 * trans_size];
378
5.96M
                }
379
3.75M
                for(k = 0; k < 4; k++)
380
2.99M
                {
381
2.99M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
382
2.99M
                                    + g_ai2_ihevc_trans_32[12][k]
383
2.99M
                                                    * pi2_tmp[12 * trans_size]
384
2.99M
                                    + g_ai2_ihevc_trans_32[20][k]
385
2.99M
                                                    * pi2_tmp[20 * trans_size]
386
2.99M
                                    + g_ai2_ihevc_trans_32[28][k]
387
2.99M
                                                    * pi2_tmp[28 * trans_size];
388
2.99M
                }
389
752k
                eeeo[0] =
390
752k
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
391
752k
                                                + g_ai2_ihevc_trans_32[24][0]
392
752k
                                                                * pi2_tmp[24
393
752k
                                                                                * trans_size];
394
752k
                eeeo[1] =
395
752k
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
396
752k
                                                + g_ai2_ihevc_trans_32[24][1]
397
752k
                                                                * pi2_tmp[24
398
752k
                                                                                * trans_size];
399
752k
                eeee[0] =
400
752k
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
401
752k
                                                + g_ai2_ihevc_trans_32[16][0]
402
752k
                                                                * pi2_tmp[16
403
752k
                                                                                * trans_size];
404
752k
                eeee[1] =
405
752k
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
406
752k
                                                + g_ai2_ihevc_trans_32[16][1]
407
752k
                                                                * pi2_tmp[16
408
752k
                                                                                * trans_size];
409
410
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411
752k
                eee[0] = eeee[0] + eeeo[0];
412
752k
                eee[3] = eeee[0] - eeeo[0];
413
752k
                eee[1] = eeee[1] + eeeo[1];
414
752k
                eee[2] = eeee[1] - eeeo[1];
415
3.74M
                for(k = 0; k < 4; k++)
416
2.99M
                {
417
2.99M
                    ee[k] = eee[k] + eeo[k];
418
2.99M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
419
2.99M
                }
420
6.74M
                for(k = 0; k < 8; k++)
421
5.99M
                {
422
5.99M
                    e[k] = ee[k] + eo[k];
423
5.99M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
424
5.99M
                }
425
12.6M
                for(k = 0; k < 16; k++)
426
11.9M
                {
427
11.9M
                    WORD32 itrans_out;
428
11.9M
                    itrans_out =
429
11.9M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
430
11.9M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
431
11.9M
                    itrans_out =
432
11.9M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
433
11.9M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
434
11.9M
                }
435
752k
                pi2_tmp++;
436
752k
                pu1_pred += pred_strd;
437
752k
                pu1_dst += dst_strd;
438
752k
            }
439
23.5k
        }
440
        /************************************************************************************************/
441
        /************************************END - IT_RECON_32x32****************************************/
442
        /************************************************************************************************/
443
68.5k
    }
444
272k
    else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
445
49.5k
    {
446
        /************************************************************************************************/
447
        /**********************************START - IT_RECON_32x32****************************************/
448
        /************************************************************************************************/
449
        /* Inverse Transform 1st stage */
450
49.5k
        shift = IT_SHIFT_STAGE_1;
451
49.5k
        add = 1 << (shift - 1);
452
453
702k
        for(j = 0; j < row_limit_2nd_stage; j++)
454
652k
        {
455
            /* Checking for Zero Cols */
456
652k
            if((zero_cols & 1) == 1)
457
445k
            {
458
445k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
459
445k
            }
460
207k
            else
461
207k
            {
462
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
463
3.52M
                for(k = 0; k < 16; k++)
464
3.31M
                {
465
3.31M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
466
3.31M
                                    + g_ai2_ihevc_trans_32[3][k]
467
3.31M
                                                    * pi2_src[3 * src_strd]
468
3.31M
                                    + g_ai2_ihevc_trans_32[5][k]
469
3.31M
                                                    * pi2_src[5 * src_strd]
470
3.31M
                                    + g_ai2_ihevc_trans_32[7][k]
471
3.31M
                                                    * pi2_src[7 * src_strd];
472
3.31M
                }
473
1.86M
                for(k = 0; k < 8; k++)
474
1.65M
                {
475
1.65M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
476
1.65M
                                    + g_ai2_ihevc_trans_32[6][k]
477
1.65M
                                                    * pi2_src[6 * src_strd];
478
1.65M
                }
479
1.03M
                for(k = 0; k < 4; k++)
480
828k
                {
481
828k
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
482
828k
                }
483
207k
                eeeo[0] = 0;
484
207k
                eeeo[1] = 0;
485
207k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
486
207k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
487
488
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
489
207k
                eee[0] = eeee[0] + eeeo[0];
490
207k
                eee[3] = eeee[0] - eeeo[0];
491
207k
                eee[1] = eeee[1] + eeeo[1];
492
207k
                eee[2] = eeee[1] - eeeo[1];
493
1.03M
                for(k = 0; k < 4; k++)
494
828k
                {
495
828k
                    ee[k] = eee[k] + eeo[k];
496
828k
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
497
828k
                }
498
1.86M
                for(k = 0; k < 8; k++)
499
1.65M
                {
500
1.65M
                    e[k] = ee[k] + eo[k];
501
1.65M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
502
1.65M
                }
503
3.51M
                for(k = 0; k < 16; k++)
504
3.31M
                {
505
3.31M
                    pi2_tmp[k] =
506
3.31M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
507
3.31M
                    pi2_tmp[k + 16] =
508
3.31M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
509
3.31M
                }
510
207k
            }
511
652k
            pi2_src++;
512
652k
            pi2_tmp += trans_size;
513
652k
            zero_cols = zero_cols >> 1;
514
652k
        }
515
516
49.5k
        pi2_tmp = pi2_tmp_orig;
517
518
        /* Inverse Transform 2nd stage */
519
49.5k
        shift = IT_SHIFT_STAGE_2;
520
49.5k
        add = 1 << (shift - 1);
521
49.5k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
522
19.0k
        {
523
624k
            for(j = 0; j < trans_size; j++)
524
605k
            {
525
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
526
10.1M
                for(k = 0; k < 16; k++)
527
9.51M
                {
528
9.51M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
529
9.51M
                                    + g_ai2_ihevc_trans_32[3][k]
530
9.51M
                                                    * pi2_tmp[3 * trans_size];
531
9.51M
                }
532
5.39M
                for(k = 0; k < 8; k++)
533
4.79M
                {
534
4.79M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
535
4.79M
                }
536
//                for(k = 0; k < 4; k++)
537
605k
                {
538
605k
                    eeo[0] = 0;
539
605k
                    eeo[1] = 0;
540
605k
                    eeo[2] = 0;
541
605k
                    eeo[3] = 0;
542
605k
                }
543
605k
                eeeo[0] = 0;
544
605k
                eeeo[1] = 0;
545
605k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
546
605k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
547
548
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
549
605k
                eee[0] = eeee[0] + eeeo[0];
550
605k
                eee[3] = eeee[0] - eeeo[0];
551
605k
                eee[1] = eeee[1] + eeeo[1];
552
605k
                eee[2] = eeee[1] - eeeo[1];
553
2.99M
                for(k = 0; k < 4; k++)
554
2.38M
                {
555
2.38M
                    ee[k] = eee[k] + eeo[k];
556
2.38M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
557
2.38M
                }
558
5.36M
                for(k = 0; k < 8; k++)
559
4.75M
                {
560
4.75M
                    e[k] = ee[k] + eo[k];
561
4.75M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
562
4.75M
                }
563
9.92M
                for(k = 0; k < 16; k++)
564
9.32M
                {
565
9.32M
                    WORD32 itrans_out;
566
9.32M
                    itrans_out =
567
9.32M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
568
9.32M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
569
9.32M
                    itrans_out =
570
9.32M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
571
9.32M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
572
9.32M
                }
573
605k
                pi2_tmp++;
574
605k
                pu1_pred += pred_strd;
575
605k
                pu1_dst += dst_strd;
576
605k
            }
577
19.0k
        }
578
30.4k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
579
16.5k
        {
580
543k
            for(j = 0; j < trans_size; j++)
581
526k
            {
582
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
583
8.83M
                for(k = 0; k < 16; k++)
584
8.31M
                {
585
8.31M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
586
8.31M
                                    + g_ai2_ihevc_trans_32[3][k]
587
8.31M
                                                    * pi2_tmp[3 * trans_size]
588
8.31M
                                    + g_ai2_ihevc_trans_32[5][k]
589
8.31M
                                                    * pi2_tmp[5 * trans_size]
590
8.31M
                                    + g_ai2_ihevc_trans_32[7][k]
591
8.31M
                                                    * pi2_tmp[7 * trans_size];
592
8.31M
                }
593
4.72M
                for(k = 0; k < 8; k++)
594
4.19M
                {
595
4.19M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
596
4.19M
                                    + g_ai2_ihevc_trans_32[6][k]
597
4.19M
                                                    * pi2_tmp[6 * trans_size];
598
4.19M
                }
599
2.62M
                for(k = 0; k < 4; k++)
600
2.09M
                {
601
2.09M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
602
2.09M
                }
603
526k
                eeeo[0] = 0;
604
526k
                eeeo[1] = 0;
605
526k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
606
526k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
607
608
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
609
526k
                eee[0] = eeee[0] + eeeo[0];
610
526k
                eee[3] = eeee[0] - eeeo[0];
611
526k
                eee[1] = eeee[1] + eeeo[1];
612
526k
                eee[2] = eeee[1] - eeeo[1];
613
2.60M
                for(k = 0; k < 4; k++)
614
2.07M
                {
615
2.07M
                    ee[k] = eee[k] + eeo[k];
616
2.07M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
617
2.07M
                }
618
4.63M
                for(k = 0; k < 8; k++)
619
4.10M
                {
620
4.10M
                    e[k] = ee[k] + eo[k];
621
4.10M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
622
4.10M
                }
623
8.57M
                for(k = 0; k < 16; k++)
624
8.04M
                {
625
8.04M
                    WORD32 itrans_out;
626
8.04M
                    itrans_out =
627
8.04M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
628
8.04M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
629
8.04M
                    itrans_out =
630
8.04M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
631
8.04M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
632
8.04M
                }
633
526k
                pi2_tmp++;
634
526k
                pu1_pred += pred_strd;
635
526k
                pu1_dst += dst_strd;
636
526k
            }
637
16.5k
        }
638
13.8k
        else /* All rows of output of 1st stage are non-zero */
639
13.8k
        {
640
455k
            for(j = 0; j < trans_size; j++)
641
441k
            {
642
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
643
7.30M
                for(k = 0; k < 16; k++)
644
6.86M
                {
645
6.86M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
646
6.86M
                                    + g_ai2_ihevc_trans_32[3][k]
647
6.86M
                                                    * pi2_tmp[3 * trans_size]
648
6.86M
                                    + g_ai2_ihevc_trans_32[5][k]
649
6.86M
                                                    * pi2_tmp[5 * trans_size]
650
6.86M
                                    + g_ai2_ihevc_trans_32[7][k]
651
6.86M
                                                    * pi2_tmp[7 * trans_size]
652
6.86M
                                    + g_ai2_ihevc_trans_32[9][k]
653
6.86M
                                                    * pi2_tmp[9 * trans_size]
654
6.86M
                                    + g_ai2_ihevc_trans_32[11][k]
655
6.86M
                                                    * pi2_tmp[11 * trans_size]
656
6.86M
                                    + g_ai2_ihevc_trans_32[13][k]
657
6.86M
                                                    * pi2_tmp[13 * trans_size]
658
6.86M
                                    + g_ai2_ihevc_trans_32[15][k]
659
6.86M
                                                    * pi2_tmp[15 * trans_size]
660
6.86M
                                    + g_ai2_ihevc_trans_32[17][k]
661
6.86M
                                                    * pi2_tmp[17 * trans_size]
662
6.86M
                                    + g_ai2_ihevc_trans_32[19][k]
663
6.86M
                                                    * pi2_tmp[19 * trans_size]
664
6.86M
                                    + g_ai2_ihevc_trans_32[21][k]
665
6.86M
                                                    * pi2_tmp[21 * trans_size]
666
6.86M
                                    + g_ai2_ihevc_trans_32[23][k]
667
6.86M
                                                    * pi2_tmp[23 * trans_size]
668
6.86M
                                    + g_ai2_ihevc_trans_32[25][k]
669
6.86M
                                                    * pi2_tmp[25 * trans_size]
670
6.86M
                                    + g_ai2_ihevc_trans_32[27][k]
671
6.86M
                                                    * pi2_tmp[27 * trans_size]
672
6.86M
                                    + g_ai2_ihevc_trans_32[29][k]
673
6.86M
                                                    * pi2_tmp[29 * trans_size]
674
6.86M
                                    + g_ai2_ihevc_trans_32[31][k]
675
6.86M
                                                    * pi2_tmp[31 * trans_size];
676
6.86M
                }
677
3.96M
                for(k = 0; k < 8; k++)
678
3.52M
                {
679
3.52M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
680
3.52M
                                    + g_ai2_ihevc_trans_32[6][k]
681
3.52M
                                                    * pi2_tmp[6 * trans_size]
682
3.52M
                                    + g_ai2_ihevc_trans_32[10][k]
683
3.52M
                                                    * pi2_tmp[10 * trans_size]
684
3.52M
                                    + g_ai2_ihevc_trans_32[14][k]
685
3.52M
                                                    * pi2_tmp[14 * trans_size]
686
3.52M
                                    + g_ai2_ihevc_trans_32[18][k]
687
3.52M
                                                    * pi2_tmp[18 * trans_size]
688
3.52M
                                    + g_ai2_ihevc_trans_32[22][k]
689
3.52M
                                                    * pi2_tmp[22 * trans_size]
690
3.52M
                                    + g_ai2_ihevc_trans_32[26][k]
691
3.52M
                                                    * pi2_tmp[26 * trans_size]
692
3.52M
                                    + g_ai2_ihevc_trans_32[30][k]
693
3.52M
                                                    * pi2_tmp[30 * trans_size];
694
3.52M
                }
695
2.21M
                for(k = 0; k < 4; k++)
696
1.77M
                {
697
1.77M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
698
1.77M
                                    + g_ai2_ihevc_trans_32[12][k]
699
1.77M
                                                    * pi2_tmp[12 * trans_size]
700
1.77M
                                    + g_ai2_ihevc_trans_32[20][k]
701
1.77M
                                                    * pi2_tmp[20 * trans_size]
702
1.77M
                                    + g_ai2_ihevc_trans_32[28][k]
703
1.77M
                                                    * pi2_tmp[28 * trans_size];
704
1.77M
                }
705
441k
                eeeo[0] =
706
441k
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
707
441k
                                                + g_ai2_ihevc_trans_32[24][0]
708
441k
                                                                * pi2_tmp[24
709
441k
                                                                                * trans_size];
710
441k
                eeeo[1] =
711
441k
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
712
441k
                                                + g_ai2_ihevc_trans_32[24][1]
713
441k
                                                                * pi2_tmp[24
714
441k
                                                                                * trans_size];
715
441k
                eeee[0] =
716
441k
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
717
441k
                                                + g_ai2_ihevc_trans_32[16][0]
718
441k
                                                                * pi2_tmp[16
719
441k
                                                                                * trans_size];
720
441k
                eeee[1] =
721
441k
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
722
441k
                                                + g_ai2_ihevc_trans_32[16][1]
723
441k
                                                                * pi2_tmp[16
724
441k
                                                                                * trans_size];
725
726
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
727
441k
                eee[0] = eeee[0] + eeeo[0];
728
441k
                eee[3] = eeee[0] - eeeo[0];
729
441k
                eee[1] = eeee[1] + eeeo[1];
730
441k
                eee[2] = eeee[1] - eeeo[1];
731
2.20M
                for(k = 0; k < 4; k++)
732
1.76M
                {
733
1.76M
                    ee[k] = eee[k] + eeo[k];
734
1.76M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
735
1.76M
                }
736
3.96M
                for(k = 0; k < 8; k++)
737
3.52M
                {
738
3.52M
                    e[k] = ee[k] + eo[k];
739
3.52M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
740
3.52M
                }
741
7.30M
                for(k = 0; k < 16; k++)
742
6.86M
                {
743
6.86M
                    WORD32 itrans_out;
744
6.86M
                    itrans_out =
745
6.86M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
746
6.86M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
747
6.86M
                    itrans_out =
748
6.86M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
749
6.86M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
750
6.86M
                }
751
441k
                pi2_tmp++;
752
441k
                pu1_pred += pred_strd;
753
441k
                pu1_dst += dst_strd;
754
441k
            }
755
13.8k
        }
756
        /************************************************************************************************/
757
        /************************************END - IT_RECON_32x32****************************************/
758
        /************************************************************************************************/
759
49.5k
    }
760
222k
    else  /* All rows of input are non-zero */
761
222k
    {
762
        /************************************************************************************************/
763
        /**********************************START - IT_RECON_32x32****************************************/
764
        /************************************************************************************************/
765
        /* Inverse Transform 1st stage */
766
222k
        shift = IT_SHIFT_STAGE_1;
767
222k
        add = 1 << (shift - 1);
768
769
6.05M
        for(j = 0; j < row_limit_2nd_stage; j++)
770
5.82M
        {
771
            /* Checking for Zero Cols */
772
5.82M
            if((zero_cols & 1) == 1)
773
796k
            {
774
796k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
775
796k
            }
776
5.03M
            else
777
5.03M
            {
778
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
779
85.5M
                for(k = 0; k < 16; k++)
780
80.5M
                {
781
80.5M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
782
80.5M
                                    + g_ai2_ihevc_trans_32[3][k]
783
80.5M
                                                    * pi2_src[3 * src_strd]
784
80.5M
                                    + g_ai2_ihevc_trans_32[5][k]
785
80.5M
                                                    * pi2_src[5 * src_strd]
786
80.5M
                                    + g_ai2_ihevc_trans_32[7][k]
787
80.5M
                                                    * pi2_src[7 * src_strd]
788
80.5M
                                    + g_ai2_ihevc_trans_32[9][k]
789
80.5M
                                                    * pi2_src[9 * src_strd]
790
80.5M
                                    + g_ai2_ihevc_trans_32[11][k]
791
80.5M
                                                    * pi2_src[11 * src_strd]
792
80.5M
                                    + g_ai2_ihevc_trans_32[13][k]
793
80.5M
                                                    * pi2_src[13 * src_strd]
794
80.5M
                                    + g_ai2_ihevc_trans_32[15][k]
795
80.5M
                                                    * pi2_src[15 * src_strd]
796
80.5M
                                    + g_ai2_ihevc_trans_32[17][k]
797
80.5M
                                                    * pi2_src[17 * src_strd]
798
80.5M
                                    + g_ai2_ihevc_trans_32[19][k]
799
80.5M
                                                    * pi2_src[19 * src_strd]
800
80.5M
                                    + g_ai2_ihevc_trans_32[21][k]
801
80.5M
                                                    * pi2_src[21 * src_strd]
802
80.5M
                                    + g_ai2_ihevc_trans_32[23][k]
803
80.5M
                                                    * pi2_src[23 * src_strd]
804
80.5M
                                    + g_ai2_ihevc_trans_32[25][k]
805
80.5M
                                                    * pi2_src[25 * src_strd]
806
80.5M
                                    + g_ai2_ihevc_trans_32[27][k]
807
80.5M
                                                    * pi2_src[27 * src_strd]
808
80.5M
                                    + g_ai2_ihevc_trans_32[29][k]
809
80.5M
                                                    * pi2_src[29 * src_strd]
810
80.5M
                                    + g_ai2_ihevc_trans_32[31][k]
811
80.5M
                                                    * pi2_src[31 * src_strd];
812
80.5M
                }
813
45.2M
                for(k = 0; k < 8; k++)
814
40.2M
                {
815
40.2M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
816
40.2M
                                    + g_ai2_ihevc_trans_32[6][k]
817
40.2M
                                                    * pi2_src[6 * src_strd]
818
40.2M
                                    + g_ai2_ihevc_trans_32[10][k]
819
40.2M
                                                    * pi2_src[10 * src_strd]
820
40.2M
                                    + g_ai2_ihevc_trans_32[14][k]
821
40.2M
                                                    * pi2_src[14 * src_strd]
822
40.2M
                                    + g_ai2_ihevc_trans_32[18][k]
823
40.2M
                                                    * pi2_src[18 * src_strd]
824
40.2M
                                    + g_ai2_ihevc_trans_32[22][k]
825
40.2M
                                                    * pi2_src[22 * src_strd]
826
40.2M
                                    + g_ai2_ihevc_trans_32[26][k]
827
40.2M
                                                    * pi2_src[26 * src_strd]
828
40.2M
                                    + g_ai2_ihevc_trans_32[30][k]
829
40.2M
                                                    * pi2_src[30 * src_strd];
830
40.2M
                }
831
25.1M
                for(k = 0; k < 4; k++)
832
20.1M
                {
833
20.1M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
834
20.1M
                                    + g_ai2_ihevc_trans_32[12][k]
835
20.1M
                                                    * pi2_src[12 * src_strd]
836
20.1M
                                    + g_ai2_ihevc_trans_32[20][k]
837
20.1M
                                                    * pi2_src[20 * src_strd]
838
20.1M
                                    + g_ai2_ihevc_trans_32[28][k]
839
20.1M
                                                    * pi2_src[28 * src_strd];
840
20.1M
                }
841
5.03M
                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
842
5.03M
                                + g_ai2_ihevc_trans_32[24][0]
843
5.03M
                                                * pi2_src[24 * src_strd];
844
5.03M
                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
845
5.03M
                                + g_ai2_ihevc_trans_32[24][1]
846
5.03M
                                                * pi2_src[24 * src_strd];
847
5.03M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
848
5.03M
                                + g_ai2_ihevc_trans_32[16][0]
849
5.03M
                                                * pi2_src[16 * src_strd];
850
5.03M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
851
5.03M
                                + g_ai2_ihevc_trans_32[16][1]
852
5.03M
                                                * pi2_src[16 * src_strd];
853
854
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
855
5.03M
                eee[0] = eeee[0] + eeeo[0];
856
5.03M
                eee[3] = eeee[0] - eeeo[0];
857
5.03M
                eee[1] = eeee[1] + eeeo[1];
858
5.03M
                eee[2] = eeee[1] - eeeo[1];
859
25.1M
                for(k = 0; k < 4; k++)
860
20.1M
                {
861
20.1M
                    ee[k] = eee[k] + eeo[k];
862
20.1M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
863
20.1M
                }
864
45.2M
                for(k = 0; k < 8; k++)
865
40.2M
                {
866
40.2M
                    e[k] = ee[k] + eo[k];
867
40.2M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
868
40.2M
                }
869
85.5M
                for(k = 0; k < 16; k++)
870
80.5M
                {
871
80.5M
                    pi2_tmp[k] =
872
80.5M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
873
80.5M
                    pi2_tmp[k + 16] =
874
80.5M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
875
80.5M
                }
876
5.03M
            }
877
5.82M
            pi2_src++;
878
5.82M
            pi2_tmp += trans_size;
879
5.82M
            zero_cols = zero_cols >> 1;
880
5.82M
        }
881
882
222k
        pi2_tmp = pi2_tmp_orig;
883
884
        /* Inverse Transform 2nd stage */
885
222k
        shift = IT_SHIFT_STAGE_2;
886
222k
        add = 1 << (shift - 1);
887
222k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
888
30.5k
        {
889
1.00M
            for(j = 0; j < trans_size; j++)
890
972k
            {
891
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
892
16.3M
                for(k = 0; k < 16; k++)
893
15.4M
                {
894
15.4M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
895
15.4M
                                    + g_ai2_ihevc_trans_32[3][k]
896
15.4M
                                                    * pi2_tmp[3 * trans_size];
897
15.4M
                }
898
8.71M
                for(k = 0; k < 8; k++)
899
7.74M
                {
900
7.74M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
901
7.74M
                }
902
//                for(k = 0; k < 4; k++)
903
972k
                {
904
972k
                    eeo[0] = 0;
905
972k
                    eeo[1] = 0;
906
972k
                    eeo[2] = 0;
907
972k
                    eeo[3] = 0;
908
972k
                }
909
972k
                eeeo[0] = 0;
910
972k
                eeeo[1] = 0;
911
972k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
912
972k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
913
914
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
915
972k
                eee[0] = eeee[0] + eeeo[0];
916
972k
                eee[3] = eeee[0] - eeeo[0];
917
972k
                eee[1] = eeee[1] + eeeo[1];
918
972k
                eee[2] = eeee[1] - eeeo[1];
919
4.83M
                for(k = 0; k < 4; k++)
920
3.86M
                {
921
3.86M
                    ee[k] = eee[k] + eeo[k];
922
3.86M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
923
3.86M
                }
924
8.69M
                for(k = 0; k < 8; k++)
925
7.72M
                {
926
7.72M
                    e[k] = ee[k] + eo[k];
927
7.72M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
928
7.72M
                }
929
16.2M
                for(k = 0; k < 16; k++)
930
15.3M
                {
931
15.3M
                    WORD32 itrans_out;
932
15.3M
                    itrans_out =
933
15.3M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
934
15.3M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
935
15.3M
                    itrans_out =
936
15.3M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
937
15.3M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
938
15.3M
                }
939
972k
                pi2_tmp++;
940
972k
                pu1_pred += pred_strd;
941
972k
                pu1_dst += dst_strd;
942
972k
            }
943
30.5k
        }
944
192k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
945
18.2k
        {
946
598k
            for(j = 0; j < trans_size; j++)
947
579k
            {
948
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
949
9.66M
                for(k = 0; k < 16; k++)
950
9.08M
                {
951
9.08M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
952
9.08M
                                    + g_ai2_ihevc_trans_32[3][k]
953
9.08M
                                                    * pi2_tmp[3 * trans_size]
954
9.08M
                                    + g_ai2_ihevc_trans_32[5][k]
955
9.08M
                                                    * pi2_tmp[5 * trans_size]
956
9.08M
                                    + g_ai2_ihevc_trans_32[7][k]
957
9.08M
                                                    * pi2_tmp[7 * trans_size];
958
9.08M
                }
959
5.18M
                for(k = 0; k < 8; k++)
960
4.60M
                {
961
4.60M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
962
4.60M
                                    + g_ai2_ihevc_trans_32[6][k]
963
4.60M
                                                    * pi2_tmp[6 * trans_size];
964
4.60M
                }
965
2.89M
                for(k = 0; k < 4; k++)
966
2.31M
                {
967
2.31M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
968
2.31M
                }
969
579k
                eeeo[0] = 0;
970
579k
                eeeo[1] = 0;
971
579k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
972
579k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
973
974
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
975
579k
                eee[0] = eeee[0] + eeeo[0];
976
579k
                eee[3] = eeee[0] - eeeo[0];
977
579k
                eee[1] = eeee[1] + eeeo[1];
978
579k
                eee[2] = eeee[1] - eeeo[1];
979
2.88M
                for(k = 0; k < 4; k++)
980
2.30M
                {
981
2.30M
                    ee[k] = eee[k] + eeo[k];
982
2.30M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
983
2.30M
                }
984
5.17M
                for(k = 0; k < 8; k++)
985
4.59M
                {
986
4.59M
                    e[k] = ee[k] + eo[k];
987
4.59M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
988
4.59M
                }
989
9.61M
                for(k = 0; k < 16; k++)
990
9.03M
                {
991
9.03M
                    WORD32 itrans_out;
992
9.03M
                    itrans_out =
993
9.03M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
994
9.03M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
995
9.03M
                    itrans_out =
996
9.03M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
997
9.03M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
998
9.03M
                }
999
579k
                pi2_tmp++;
1000
579k
                pu1_pred += pred_strd;
1001
579k
                pu1_dst += dst_strd;
1002
579k
            }
1003
18.2k
        }
1004
173k
        else /* All rows of output of 1st stage are non-zero */
1005
173k
        {
1006
5.73M
            for(j = 0; j < trans_size; j++)
1007
5.55M
            {
1008
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1009
93.9M
                for(k = 0; k < 16; k++)
1010
88.3M
                {
1011
88.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
1012
88.3M
                                    + g_ai2_ihevc_trans_32[3][k]
1013
88.3M
                                                    * pi2_tmp[3 * trans_size]
1014
88.3M
                                    + g_ai2_ihevc_trans_32[5][k]
1015
88.3M
                                                    * pi2_tmp[5 * trans_size]
1016
88.3M
                                    + g_ai2_ihevc_trans_32[7][k]
1017
88.3M
                                                    * pi2_tmp[7 * trans_size]
1018
88.3M
                                    + g_ai2_ihevc_trans_32[9][k]
1019
88.3M
                                                    * pi2_tmp[9 * trans_size]
1020
88.3M
                                    + g_ai2_ihevc_trans_32[11][k]
1021
88.3M
                                                    * pi2_tmp[11 * trans_size]
1022
88.3M
                                    + g_ai2_ihevc_trans_32[13][k]
1023
88.3M
                                                    * pi2_tmp[13 * trans_size]
1024
88.3M
                                    + g_ai2_ihevc_trans_32[15][k]
1025
88.3M
                                                    * pi2_tmp[15 * trans_size]
1026
88.3M
                                    + g_ai2_ihevc_trans_32[17][k]
1027
88.3M
                                                    * pi2_tmp[17 * trans_size]
1028
88.3M
                                    + g_ai2_ihevc_trans_32[19][k]
1029
88.3M
                                                    * pi2_tmp[19 * trans_size]
1030
88.3M
                                    + g_ai2_ihevc_trans_32[21][k]
1031
88.3M
                                                    * pi2_tmp[21 * trans_size]
1032
88.3M
                                    + g_ai2_ihevc_trans_32[23][k]
1033
88.3M
                                                    * pi2_tmp[23 * trans_size]
1034
88.3M
                                    + g_ai2_ihevc_trans_32[25][k]
1035
88.3M
                                                    * pi2_tmp[25 * trans_size]
1036
88.3M
                                    + g_ai2_ihevc_trans_32[27][k]
1037
88.3M
                                                    * pi2_tmp[27 * trans_size]
1038
88.3M
                                    + g_ai2_ihevc_trans_32[29][k]
1039
88.3M
                                                    * pi2_tmp[29 * trans_size]
1040
88.3M
                                    + g_ai2_ihevc_trans_32[31][k]
1041
88.3M
                                                    * pi2_tmp[31 * trans_size];
1042
88.3M
                }
1043
49.9M
                for(k = 0; k < 8; k++)
1044
44.3M
                {
1045
44.3M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
1046
44.3M
                                    + g_ai2_ihevc_trans_32[6][k]
1047
44.3M
                                                    * pi2_tmp[6 * trans_size]
1048
44.3M
                                    + g_ai2_ihevc_trans_32[10][k]
1049
44.3M
                                                    * pi2_tmp[10 * trans_size]
1050
44.3M
                                    + g_ai2_ihevc_trans_32[14][k]
1051
44.3M
                                                    * pi2_tmp[14 * trans_size]
1052
44.3M
                                    + g_ai2_ihevc_trans_32[18][k]
1053
44.3M
                                                    * pi2_tmp[18 * trans_size]
1054
44.3M
                                    + g_ai2_ihevc_trans_32[22][k]
1055
44.3M
                                                    * pi2_tmp[22 * trans_size]
1056
44.3M
                                    + g_ai2_ihevc_trans_32[26][k]
1057
44.3M
                                                    * pi2_tmp[26 * trans_size]
1058
44.3M
                                    + g_ai2_ihevc_trans_32[30][k]
1059
44.3M
                                                    * pi2_tmp[30 * trans_size];
1060
44.3M
                }
1061
27.7M
                for(k = 0; k < 4; k++)
1062
22.2M
                {
1063
22.2M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
1064
22.2M
                                    + g_ai2_ihevc_trans_32[12][k]
1065
22.2M
                                                    * pi2_tmp[12 * trans_size]
1066
22.2M
                                    + g_ai2_ihevc_trans_32[20][k]
1067
22.2M
                                                    * pi2_tmp[20 * trans_size]
1068
22.2M
                                    + g_ai2_ihevc_trans_32[28][k]
1069
22.2M
                                                    * pi2_tmp[28 * trans_size];
1070
22.2M
                }
1071
5.55M
                eeeo[0] =
1072
5.55M
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
1073
5.55M
                                                + g_ai2_ihevc_trans_32[24][0]
1074
5.55M
                                                                * pi2_tmp[24
1075
5.55M
                                                                                * trans_size];
1076
5.55M
                eeeo[1] =
1077
5.55M
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
1078
5.55M
                                                + g_ai2_ihevc_trans_32[24][1]
1079
5.55M
                                                                * pi2_tmp[24
1080
5.55M
                                                                                * trans_size];
1081
5.55M
                eeee[0] =
1082
5.55M
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
1083
5.55M
                                                + g_ai2_ihevc_trans_32[16][0]
1084
5.55M
                                                                * pi2_tmp[16
1085
5.55M
                                                                                * trans_size];
1086
5.55M
                eeee[1] =
1087
5.55M
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
1088
5.55M
                                                + g_ai2_ihevc_trans_32[16][1]
1089
5.55M
                                                                * pi2_tmp[16
1090
5.55M
                                                                                * trans_size];
1091
1092
                /* Combining the e and o terms at each hierarchy level to calculate the final spatial domain vector */
1093
5.55M
                eee[0] = eeee[0] + eeeo[0];
1094
5.55M
                eee[3] = eeee[0] - eeeo[0];
1095
5.55M
                eee[1] = eeee[1] + eeeo[1];
1096
5.55M
                eee[2] = eeee[1] - eeeo[1];
1097
27.7M
                for(k = 0; k < 4; k++)
1098
22.2M
                {
1099
22.2M
                    ee[k] = eee[k] + eeo[k];
1100
22.2M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
1101
22.2M
                }
1102
49.9M
                for(k = 0; k < 8; k++)
1103
44.4M
                {
1104
44.4M
                    e[k] = ee[k] + eo[k];
1105
44.4M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
1106
44.4M
                }
1107
94.1M
                for(k = 0; k < 16; k++)
1108
88.6M
                {
1109
88.6M
                    WORD32 itrans_out;
1110
88.6M
                    itrans_out =
1111
88.6M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
1112
88.6M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
1113
88.6M
                    itrans_out =
1114
88.6M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
1115
88.6M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
1116
88.6M
                }
1117
5.55M
                pi2_tmp++;
1118
5.55M
                pu1_pred += pred_strd;
1119
5.55M
                pu1_dst += dst_strd;
1120
5.55M
            }
1121
173k
        }
1122
        /************************************************************************************************/
1123
        /************************************END - IT_RECON_32x32****************************************/
1124
        /************************************************************************************************/
1125
222k
    }
1126
340k
}
1127