Coverage Report

Created: 2025-12-08 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/common/ihevc_itrans_recon_32x32.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
 *******************************************************************************
20
 * @file
21
 *  ihevc_itrans_recon_32x32.c
22
 *
23
 * @brief
24
 *  Contains function definitions for inverse transform  and reconstruction 32x32
25
 *
26
 *
27
 * @author
28
 *  100470
29
 *
30
 * @par List of Functions:
31
 *  - ihevc_itrans_recon_32x32()
32
 *
33
 * @remarks
34
 *  None
35
 *
36
 *******************************************************************************
37
 */
38
#include <stdio.h>
39
#include <string.h>
40
#include "ihevc_typedefs.h"
41
#include "ihevc_macros.h"
42
#include "ihevc_platform_macros.h"
43
#include "ihevc_defs.h"
44
#include "ihevc_trans_tables.h"
45
#include "ihevc_itrans_recon.h"
46
#include "ihevc_func_selector.h"
47
#include "ihevc_trans_macros.h"
48
49
50
/**
51
 *******************************************************************************
52
 *
53
 * @brief
54
 *  This function performs Inverse transform  and reconstruction for 32x32
55
 * input block
56
 *
57
 * @par Description:
58
 *  Performs the inverse transform, adds the prediction data and clips the
59
 * output to 8 bits
60
 *
61
 * @param[in] pi2_src
62
 *  Input 32x32 coefficients
63
 *
64
 * @param[in] pi2_tmp
65
 *  Temporary 32x32 buffer for storing the output of the
65
 *  1st stage of the inverse transform
66
 *
67
 *
69
 *
70
 * @param[in] pu1_pred
71
 *  Prediction 32x32 block
72
 *
73
 * @param[out] pu1_dst
74
 *  Output 32x32 block
75
 *
76
 * @param[in] src_strd
77
 *  Input stride
78
 *
79
 * @param[in] pred_strd
80
 *  Prediction stride
81
 *
82
 * @param[in] dst_strd
83
 *  Output Stride
84
 *
85
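 * @param[in] zero_cols
86
 *  Bitmask of all-zero columns in pi2_src (bit j set => column j is zero)
87
 *
88
 * @param[in] zero_rows
89
 *  Bitmask of all-zero rows in pi2_src (bit i set => row i is zero)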
90
 *
91
 * @returns  Void
92
 *
93
 * @remarks
94
 *  None
95
 *
96
 *******************************************************************************
97
 */
98
99
void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
100
                              WORD16 *pi2_tmp,
101
                              UWORD8 *pu1_pred,
102
                              UWORD8 *pu1_dst,
103
                              WORD32 src_strd,
104
                              WORD32 pred_strd,
105
                              WORD32 dst_strd,
106
                              WORD32 zero_cols,
107
                              WORD32 zero_rows)
108
904k
{
109
904k
    WORD32 j, k;
110
904k
    WORD32 e[16], o[16];
111
904k
    WORD32 ee[8], eo[8];
112
904k
    WORD32 eee[4], eeo[4];
113
904k
    WORD32 eeee[2], eeeo[2];
114
904k
    WORD32 add;
115
904k
    WORD32 shift;
116
904k
    WORD16 *pi2_tmp_orig;
117
904k
    WORD32 trans_size;
118
904k
    WORD32 zero_rows_2nd_stage = zero_cols;
119
904k
    WORD32 row_limit_2nd_stage;
120
121
904k
    trans_size = TRANS_SIZE_32;
122
904k
    pi2_tmp_orig = pi2_tmp;
123
124
904k
    if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
125
310k
        row_limit_2nd_stage = 4;
126
594k
    else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
127
284k
        row_limit_2nd_stage = 8;
128
310k
    else
129
310k
        row_limit_2nd_stage = TRANS_SIZE_32;
130
131
904k
    if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0)  /* First 4 rows of input are non-zero */
132
310k
    {
133
        /************************************************************************************************/
134
        /**********************************START - IT_RECON_32x32****************************************/
135
        /************************************************************************************************/
136
        /* Inverse Transform 1st stage */
137
310k
        shift = IT_SHIFT_STAGE_1;
138
310k
        add = 1 << (shift - 1);
139
140
3.85M
        for(j = 0; j < row_limit_2nd_stage; j++)
141
3.54M
        {
142
            /* Checking for Zero Cols */
143
3.54M
            if((zero_cols & 1) == 1)
144
2.19M
            {
145
2.19M
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
146
2.19M
            }
147
1.34M
            else
148
1.34M
            {
149
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
150
22.9M
                for(k = 0; k < 16; k++)
151
21.5M
                {
152
21.5M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
153
21.5M
                                    + g_ai2_ihevc_trans_32[3][k]
154
21.5M
                                                    * pi2_src[3 * src_strd];
155
21.5M
                }
156
12.1M
                for(k = 0; k < 8; k++)
157
10.7M
                {
158
10.7M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
159
10.7M
                }
160
//                for(k = 0; k < 4; k++)
161
1.34M
                {
162
1.34M
                    eeo[0] = 0;
163
1.34M
                    eeo[1] = 0;
164
1.34M
                    eeo[2] = 0;
165
1.34M
                    eeo[3] = 0;
166
1.34M
                }
167
1.34M
                eeeo[0] = 0;
168
1.34M
                eeeo[1] = 0;
169
1.34M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
170
1.34M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
171
172
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
173
1.34M
                eee[0] = eeee[0] + eeeo[0];
174
1.34M
                eee[3] = eeee[0] - eeeo[0];
175
1.34M
                eee[1] = eeee[1] + eeeo[1];
176
1.34M
                eee[2] = eeee[1] - eeeo[1];
177
6.74M
                for(k = 0; k < 4; k++)
178
5.39M
                {
179
5.39M
                    ee[k] = eee[k] + eeo[k];
180
5.39M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
181
5.39M
                }
182
12.1M
                for(k = 0; k < 8; k++)
183
10.7M
                {
184
10.7M
                    e[k] = ee[k] + eo[k];
185
10.7M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
186
10.7M
                }
187
22.8M
                for(k = 0; k < 16; k++)
188
21.5M
                {
189
21.5M
                    pi2_tmp[k] =
190
21.5M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
191
21.5M
                    pi2_tmp[k + 16] =
192
21.5M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
193
21.5M
                }
194
1.34M
            }
195
3.54M
            pi2_src++;
196
3.54M
            pi2_tmp += trans_size;
197
3.54M
            zero_cols = zero_cols >> 1;
198
3.54M
        }
199
200
310k
        pi2_tmp = pi2_tmp_orig;
201
202
        /* Inverse Transform 2nd stage */
203
310k
        shift = IT_SHIFT_STAGE_2;
204
310k
        add = 1 << (shift - 1);
205
310k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
206
153k
        {
207
5.05M
            for(j = 0; j < trans_size; j++)
208
4.90M
            {
209
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
210
83.3M
                for(k = 0; k < 16; k++)
211
78.3M
                {
212
78.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
213
78.3M
                                    + g_ai2_ihevc_trans_32[3][k]
214
78.3M
                                                    * pi2_tmp[3 * trans_size];
215
78.3M
                }
216
44.1M
                for(k = 0; k < 8; k++)
217
39.2M
                {
218
39.2M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
219
39.2M
                }
220
//                for(k = 0; k < 4; k++)
221
4.90M
                {
222
4.90M
                    eeo[0] = 0;
223
4.90M
                    eeo[1] = 0;
224
4.90M
                    eeo[2] = 0;
225
4.90M
                    eeo[3] = 0;
226
4.90M
                }
227
4.90M
                eeeo[0] = 0;
228
4.90M
                eeeo[1] = 0;
229
4.90M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
230
4.90M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
231
232
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
233
4.90M
                eee[0] = eeee[0] + eeeo[0];
234
4.90M
                eee[3] = eeee[0] - eeeo[0];
235
4.90M
                eee[1] = eeee[1] + eeeo[1];
236
4.90M
                eee[2] = eeee[1] - eeeo[1];
237
24.4M
                for(k = 0; k < 4; k++)
238
19.5M
                {
239
19.5M
                    ee[k] = eee[k] + eeo[k];
240
19.5M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
241
19.5M
                }
242
44.0M
                for(k = 0; k < 8; k++)
243
39.1M
                {
244
39.1M
                    e[k] = ee[k] + eo[k];
245
39.1M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
246
39.1M
                }
247
82.9M
                for(k = 0; k < 16; k++)
248
78.0M
                {
249
78.0M
                    WORD32 itrans_out;
250
78.0M
                    itrans_out =
251
78.0M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
252
78.0M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
253
78.0M
                    itrans_out =
254
78.0M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
255
78.0M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
256
78.0M
                }
257
4.90M
                pi2_tmp++;
258
4.90M
                pu1_pred += pred_strd;
259
4.90M
                pu1_dst += dst_strd;
260
4.90M
            }
261
153k
        }
262
157k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
263
86.9k
        {
264
2.86M
            for(j = 0; j < trans_size; j++)
265
2.77M
            {
266
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
267
47.0M
                for(k = 0; k < 16; k++)
268
44.2M
                {
269
44.2M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
270
44.2M
                                    + g_ai2_ihevc_trans_32[3][k]
271
44.2M
                                                    * pi2_tmp[3 * trans_size]
272
44.2M
                                    + g_ai2_ihevc_trans_32[5][k]
273
44.2M
                                                    * pi2_tmp[5 * trans_size]
274
44.2M
                                    + g_ai2_ihevc_trans_32[7][k]
275
44.2M
                                                    * pi2_tmp[7 * trans_size];
276
44.2M
                }
277
24.9M
                for(k = 0; k < 8; k++)
278
22.1M
                {
279
22.1M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
280
22.1M
                                    + g_ai2_ihevc_trans_32[6][k]
281
22.1M
                                                    * pi2_tmp[6 * trans_size];
282
22.1M
                }
283
13.8M
                for(k = 0; k < 4; k++)
284
11.0M
                {
285
11.0M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
286
11.0M
                }
287
2.77M
                eeeo[0] = 0;
288
2.77M
                eeeo[1] = 0;
289
2.77M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
290
2.77M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
291
292
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
293
2.77M
                eee[0] = eeee[0] + eeeo[0];
294
2.77M
                eee[3] = eeee[0] - eeeo[0];
295
2.77M
                eee[1] = eeee[1] + eeeo[1];
296
2.77M
                eee[2] = eeee[1] - eeeo[1];
297
13.8M
                for(k = 0; k < 4; k++)
298
11.0M
                {
299
11.0M
                    ee[k] = eee[k] + eeo[k];
300
11.0M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
301
11.0M
                }
302
24.8M
                for(k = 0; k < 8; k++)
303
22.1M
                {
304
22.1M
                    e[k] = ee[k] + eo[k];
305
22.1M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
306
22.1M
                }
307
46.6M
                for(k = 0; k < 16; k++)
308
43.8M
                {
309
43.8M
                    WORD32 itrans_out;
310
43.8M
                    itrans_out =
311
43.8M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
312
43.8M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
313
43.8M
                    itrans_out =
314
43.8M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
315
43.8M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
316
43.8M
                }
317
2.77M
                pi2_tmp++;
318
2.77M
                pu1_pred += pred_strd;
319
2.77M
                pu1_dst += dst_strd;
320
2.77M
            }
321
86.9k
        }
322
70.0k
        else /* All rows of output of 1st stage are non-zero */
323
70.0k
        {
324
2.30M
            for(j = 0; j < trans_size; j++)
325
2.23M
            {
326
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
327
36.9M
                for(k = 0; k < 16; k++)
328
34.7M
                {
329
34.7M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
330
34.7M
                                    + g_ai2_ihevc_trans_32[3][k]
331
34.7M
                                                    * pi2_tmp[3 * trans_size]
332
34.7M
                                    + g_ai2_ihevc_trans_32[5][k]
333
34.7M
                                                    * pi2_tmp[5 * trans_size]
334
34.7M
                                    + g_ai2_ihevc_trans_32[7][k]
335
34.7M
                                                    * pi2_tmp[7 * trans_size]
336
34.7M
                                    + g_ai2_ihevc_trans_32[9][k]
337
34.7M
                                                    * pi2_tmp[9 * trans_size]
338
34.7M
                                    + g_ai2_ihevc_trans_32[11][k]
339
34.7M
                                                    * pi2_tmp[11 * trans_size]
340
34.7M
                                    + g_ai2_ihevc_trans_32[13][k]
341
34.7M
                                                    * pi2_tmp[13 * trans_size]
342
34.7M
                                    + g_ai2_ihevc_trans_32[15][k]
343
34.7M
                                                    * pi2_tmp[15 * trans_size]
344
34.7M
                                    + g_ai2_ihevc_trans_32[17][k]
345
34.7M
                                                    * pi2_tmp[17 * trans_size]
346
34.7M
                                    + g_ai2_ihevc_trans_32[19][k]
347
34.7M
                                                    * pi2_tmp[19 * trans_size]
348
34.7M
                                    + g_ai2_ihevc_trans_32[21][k]
349
34.7M
                                                    * pi2_tmp[21 * trans_size]
350
34.7M
                                    + g_ai2_ihevc_trans_32[23][k]
351
34.7M
                                                    * pi2_tmp[23 * trans_size]
352
34.7M
                                    + g_ai2_ihevc_trans_32[25][k]
353
34.7M
                                                    * pi2_tmp[25 * trans_size]
354
34.7M
                                    + g_ai2_ihevc_trans_32[27][k]
355
34.7M
                                                    * pi2_tmp[27 * trans_size]
356
34.7M
                                    + g_ai2_ihevc_trans_32[29][k]
357
34.7M
                                                    * pi2_tmp[29 * trans_size]
358
34.7M
                                    + g_ai2_ihevc_trans_32[31][k]
359
34.7M
                                                    * pi2_tmp[31 * trans_size];
360
34.7M
                }
361
19.9M
                for(k = 0; k < 8; k++)
362
17.6M
                {
363
17.6M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
364
17.6M
                                    + g_ai2_ihevc_trans_32[6][k]
365
17.6M
                                                    * pi2_tmp[6 * trans_size]
366
17.6M
                                    + g_ai2_ihevc_trans_32[10][k]
367
17.6M
                                                    * pi2_tmp[10 * trans_size]
368
17.6M
                                    + g_ai2_ihevc_trans_32[14][k]
369
17.6M
                                                    * pi2_tmp[14 * trans_size]
370
17.6M
                                    + g_ai2_ihevc_trans_32[18][k]
371
17.6M
                                                    * pi2_tmp[18 * trans_size]
372
17.6M
                                    + g_ai2_ihevc_trans_32[22][k]
373
17.6M
                                                    * pi2_tmp[22 * trans_size]
374
17.6M
                                    + g_ai2_ihevc_trans_32[26][k]
375
17.6M
                                                    * pi2_tmp[26 * trans_size]
376
17.6M
                                    + g_ai2_ihevc_trans_32[30][k]
377
17.6M
                                                    * pi2_tmp[30 * trans_size];
378
17.6M
                }
379
11.1M
                for(k = 0; k < 4; k++)
380
8.90M
                {
381
8.90M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
382
8.90M
                                    + g_ai2_ihevc_trans_32[12][k]
383
8.90M
                                                    * pi2_tmp[12 * trans_size]
384
8.90M
                                    + g_ai2_ihevc_trans_32[20][k]
385
8.90M
                                                    * pi2_tmp[20 * trans_size]
386
8.90M
                                    + g_ai2_ihevc_trans_32[28][k]
387
8.90M
                                                    * pi2_tmp[28 * trans_size];
388
8.90M
                }
389
2.23M
                eeeo[0] =
390
2.23M
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
391
2.23M
                                                + g_ai2_ihevc_trans_32[24][0]
392
2.23M
                                                                * pi2_tmp[24
393
2.23M
                                                                                * trans_size];
394
2.23M
                eeeo[1] =
395
2.23M
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
396
2.23M
                                                + g_ai2_ihevc_trans_32[24][1]
397
2.23M
                                                                * pi2_tmp[24
398
2.23M
                                                                                * trans_size];
399
2.23M
                eeee[0] =
400
2.23M
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
401
2.23M
                                                + g_ai2_ihevc_trans_32[16][0]
402
2.23M
                                                                * pi2_tmp[16
403
2.23M
                                                                                * trans_size];
404
2.23M
                eeee[1] =
405
2.23M
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
406
2.23M
                                                + g_ai2_ihevc_trans_32[16][1]
407
2.23M
                                                                * pi2_tmp[16
408
2.23M
                                                                                * trans_size];
409
410
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411
2.23M
                eee[0] = eeee[0] + eeeo[0];
412
2.23M
                eee[3] = eeee[0] - eeeo[0];
413
2.23M
                eee[1] = eeee[1] + eeeo[1];
414
2.23M
                eee[2] = eeee[1] - eeeo[1];
415
11.1M
                for(k = 0; k < 4; k++)
416
8.87M
                {
417
8.87M
                    ee[k] = eee[k] + eeo[k];
418
8.87M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
419
8.87M
                }
420
19.9M
                for(k = 0; k < 8; k++)
421
17.7M
                {
422
17.7M
                    e[k] = ee[k] + eo[k];
423
17.7M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
424
17.7M
                }
425
37.4M
                for(k = 0; k < 16; k++)
426
35.2M
                {
427
35.2M
                    WORD32 itrans_out;
428
35.2M
                    itrans_out =
429
35.2M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
430
35.2M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
431
35.2M
                    itrans_out =
432
35.2M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
433
35.2M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
434
35.2M
                }
435
2.23M
                pi2_tmp++;
436
2.23M
                pu1_pred += pred_strd;
437
2.23M
                pu1_dst += dst_strd;
438
2.23M
            }
439
70.0k
        }
440
        /************************************************************************************************/
441
        /************************************END - IT_RECON_32x32****************************************/
442
        /************************************************************************************************/
443
310k
    }
444
594k
    else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
445
233k
    {
446
        /************************************************************************************************/
447
        /**********************************START - IT_RECON_32x32****************************************/
448
        /************************************************************************************************/
449
        /* Inverse Transform 1st stage */
450
233k
        shift = IT_SHIFT_STAGE_1;
451
233k
        add = 1 << (shift - 1);
452
453
3.28M
        for(j = 0; j < row_limit_2nd_stage; j++)
454
3.05M
        {
455
            /* Checking for Zero Cols */
456
3.05M
            if((zero_cols & 1) == 1)
457
2.38M
            {
458
2.38M
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
459
2.38M
            }
460
662k
            else
461
662k
            {
462
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
463
11.2M
                for(k = 0; k < 16; k++)
464
10.6M
                {
465
10.6M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
466
10.6M
                                    + g_ai2_ihevc_trans_32[3][k]
467
10.6M
                                                    * pi2_src[3 * src_strd]
468
10.6M
                                    + g_ai2_ihevc_trans_32[5][k]
469
10.6M
                                                    * pi2_src[5 * src_strd]
470
10.6M
                                    + g_ai2_ihevc_trans_32[7][k]
471
10.6M
                                                    * pi2_src[7 * src_strd];
472
10.6M
                }
473
5.96M
                for(k = 0; k < 8; k++)
474
5.30M
                {
475
5.30M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
476
5.30M
                                    + g_ai2_ihevc_trans_32[6][k]
477
5.30M
                                                    * pi2_src[6 * src_strd];
478
5.30M
                }
479
3.31M
                for(k = 0; k < 4; k++)
480
2.65M
                {
481
2.65M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
482
2.65M
                }
483
662k
                eeeo[0] = 0;
484
662k
                eeeo[1] = 0;
485
662k
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
486
662k
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
487
488
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
489
662k
                eee[0] = eeee[0] + eeeo[0];
490
662k
                eee[3] = eeee[0] - eeeo[0];
491
662k
                eee[1] = eeee[1] + eeeo[1];
492
662k
                eee[2] = eeee[1] - eeeo[1];
493
3.31M
                for(k = 0; k < 4; k++)
494
2.65M
                {
495
2.65M
                    ee[k] = eee[k] + eeo[k];
496
2.65M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
497
2.65M
                }
498
5.96M
                for(k = 0; k < 8; k++)
499
5.30M
                {
500
5.30M
                    e[k] = ee[k] + eo[k];
501
5.30M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
502
5.30M
                }
503
11.2M
                for(k = 0; k < 16; k++)
504
10.6M
                {
505
10.6M
                    pi2_tmp[k] =
506
10.6M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
507
10.6M
                    pi2_tmp[k + 16] =
508
10.6M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
509
10.6M
                }
510
662k
            }
511
3.05M
            pi2_src++;
512
3.05M
            pi2_tmp += trans_size;
513
3.05M
            zero_cols = zero_cols >> 1;
514
3.05M
        }
515
516
233k
        pi2_tmp = pi2_tmp_orig;
517
518
        /* Inverse Transform 2nd stage */
519
233k
        shift = IT_SHIFT_STAGE_2;
520
233k
        add = 1 << (shift - 1);
521
233k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
522
70.5k
        {
523
2.30M
            for(j = 0; j < trans_size; j++)
524
2.23M
            {
525
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
526
37.5M
                for(k = 0; k < 16; k++)
527
35.3M
                {
528
35.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
529
35.3M
                                    + g_ai2_ihevc_trans_32[3][k]
530
35.3M
                                                    * pi2_tmp[3 * trans_size];
531
35.3M
                }
532
20.0M
                for(k = 0; k < 8; k++)
533
17.7M
                {
534
17.7M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
535
17.7M
                }
536
//                for(k = 0; k < 4; k++)
537
2.23M
                {
538
2.23M
                    eeo[0] = 0;
539
2.23M
                    eeo[1] = 0;
540
2.23M
                    eeo[2] = 0;
541
2.23M
                    eeo[3] = 0;
542
2.23M
                }
543
2.23M
                eeeo[0] = 0;
544
2.23M
                eeeo[1] = 0;
545
2.23M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
546
2.23M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
547
548
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
549
2.23M
                eee[0] = eeee[0] + eeeo[0];
550
2.23M
                eee[3] = eeee[0] - eeeo[0];
551
2.23M
                eee[1] = eeee[1] + eeeo[1];
552
2.23M
                eee[2] = eeee[1] - eeeo[1];
553
11.0M
                for(k = 0; k < 4; k++)
554
8.84M
                {
555
8.84M
                    ee[k] = eee[k] + eeo[k];
556
8.84M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
557
8.84M
                }
558
19.8M
                for(k = 0; k < 8; k++)
559
17.6M
                {
560
17.6M
                    e[k] = ee[k] + eo[k];
561
17.6M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
562
17.6M
                }
563
36.9M
                for(k = 0; k < 16; k++)
564
34.6M
                {
565
34.6M
                    WORD32 itrans_out;
566
34.6M
                    itrans_out =
567
34.6M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
568
34.6M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
569
34.6M
                    itrans_out =
570
34.6M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
571
34.6M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
572
34.6M
                }
573
2.23M
                pi2_tmp++;
574
2.23M
                pu1_pred += pred_strd;
575
2.23M
                pu1_dst += dst_strd;
576
2.23M
            }
577
70.5k
        }
578
163k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
579
101k
        {
580
3.32M
            for(j = 0; j < trans_size; j++)
581
3.22M
            {
582
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
583
53.8M
                for(k = 0; k < 16; k++)
584
50.6M
                {
585
50.6M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
586
50.6M
                                    + g_ai2_ihevc_trans_32[3][k]
587
50.6M
                                                    * pi2_tmp[3 * trans_size]
588
50.6M
                                    + g_ai2_ihevc_trans_32[5][k]
589
50.6M
                                                    * pi2_tmp[5 * trans_size]
590
50.6M
                                    + g_ai2_ihevc_trans_32[7][k]
591
50.6M
                                                    * pi2_tmp[7 * trans_size];
592
50.6M
                }
593
28.8M
                for(k = 0; k < 8; k++)
594
25.6M
                {
595
25.6M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
596
25.6M
                                    + g_ai2_ihevc_trans_32[6][k]
597
25.6M
                                                    * pi2_tmp[6 * trans_size];
598
25.6M
                }
599
16.0M
                for(k = 0; k < 4; k++)
600
12.8M
                {
601
12.8M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
602
12.8M
                }
603
3.22M
                eeeo[0] = 0;
604
3.22M
                eeeo[1] = 0;
605
3.22M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
606
3.22M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
607
608
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
609
3.22M
                eee[0] = eeee[0] + eeeo[0];
610
3.22M
                eee[3] = eeee[0] - eeeo[0];
611
3.22M
                eee[1] = eeee[1] + eeeo[1];
612
3.22M
                eee[2] = eeee[1] - eeeo[1];
613
15.8M
                for(k = 0; k < 4; k++)
614
12.6M
                {
615
12.6M
                    ee[k] = eee[k] + eeo[k];
616
12.6M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
617
12.6M
                }
618
28.2M
                for(k = 0; k < 8; k++)
619
25.0M
                {
620
25.0M
                    e[k] = ee[k] + eo[k];
621
25.0M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
622
25.0M
                }
623
51.2M
                for(k = 0; k < 16; k++)
624
47.9M
                {
625
47.9M
                    WORD32 itrans_out;
626
47.9M
                    itrans_out =
627
47.9M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
628
47.9M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
629
47.9M
                    itrans_out =
630
47.9M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
631
47.9M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
632
47.9M
                }
633
3.22M
                pi2_tmp++;
634
3.22M
                pu1_pred += pred_strd;
635
3.22M
                pu1_dst += dst_strd;
636
3.22M
            }
637
101k
        }
638
61.1k
        else /* All rows of output of 1st stage are non-zero */
639
61.1k
        {
640
2.01M
            for(j = 0; j < trans_size; j++)
641
1.95M
            {
642
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
643
32.5M
                for(k = 0; k < 16; k++)
644
30.6M
                {
645
30.6M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
646
30.6M
                                    + g_ai2_ihevc_trans_32[3][k]
647
30.6M
                                                    * pi2_tmp[3 * trans_size]
648
30.6M
                                    + g_ai2_ihevc_trans_32[5][k]
649
30.6M
                                                    * pi2_tmp[5 * trans_size]
650
30.6M
                                    + g_ai2_ihevc_trans_32[7][k]
651
30.6M
                                                    * pi2_tmp[7 * trans_size]
652
30.6M
                                    + g_ai2_ihevc_trans_32[9][k]
653
30.6M
                                                    * pi2_tmp[9 * trans_size]
654
30.6M
                                    + g_ai2_ihevc_trans_32[11][k]
655
30.6M
                                                    * pi2_tmp[11 * trans_size]
656
30.6M
                                    + g_ai2_ihevc_trans_32[13][k]
657
30.6M
                                                    * pi2_tmp[13 * trans_size]
658
30.6M
                                    + g_ai2_ihevc_trans_32[15][k]
659
30.6M
                                                    * pi2_tmp[15 * trans_size]
660
30.6M
                                    + g_ai2_ihevc_trans_32[17][k]
661
30.6M
                                                    * pi2_tmp[17 * trans_size]
662
30.6M
                                    + g_ai2_ihevc_trans_32[19][k]
663
30.6M
                                                    * pi2_tmp[19 * trans_size]
664
30.6M
                                    + g_ai2_ihevc_trans_32[21][k]
665
30.6M
                                                    * pi2_tmp[21 * trans_size]
666
30.6M
                                    + g_ai2_ihevc_trans_32[23][k]
667
30.6M
                                                    * pi2_tmp[23 * trans_size]
668
30.6M
                                    + g_ai2_ihevc_trans_32[25][k]
669
30.6M
                                                    * pi2_tmp[25 * trans_size]
670
30.6M
                                    + g_ai2_ihevc_trans_32[27][k]
671
30.6M
                                                    * pi2_tmp[27 * trans_size]
672
30.6M
                                    + g_ai2_ihevc_trans_32[29][k]
673
30.6M
                                                    * pi2_tmp[29 * trans_size]
674
30.6M
                                    + g_ai2_ihevc_trans_32[31][k]
675
30.6M
                                                    * pi2_tmp[31 * trans_size];
676
30.6M
                }
677
17.5M
                for(k = 0; k < 8; k++)
678
15.5M
                {
679
15.5M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
680
15.5M
                                    + g_ai2_ihevc_trans_32[6][k]
681
15.5M
                                                    * pi2_tmp[6 * trans_size]
682
15.5M
                                    + g_ai2_ihevc_trans_32[10][k]
683
15.5M
                                                    * pi2_tmp[10 * trans_size]
684
15.5M
                                    + g_ai2_ihevc_trans_32[14][k]
685
15.5M
                                                    * pi2_tmp[14 * trans_size]
686
15.5M
                                    + g_ai2_ihevc_trans_32[18][k]
687
15.5M
                                                    * pi2_tmp[18 * trans_size]
688
15.5M
                                    + g_ai2_ihevc_trans_32[22][k]
689
15.5M
                                                    * pi2_tmp[22 * trans_size]
690
15.5M
                                    + g_ai2_ihevc_trans_32[26][k]
691
15.5M
                                                    * pi2_tmp[26 * trans_size]
692
15.5M
                                    + g_ai2_ihevc_trans_32[30][k]
693
15.5M
                                                    * pi2_tmp[30 * trans_size];
694
15.5M
                }
695
9.76M
                for(k = 0; k < 4; k++)
696
7.81M
                {
697
7.81M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
698
7.81M
                                    + g_ai2_ihevc_trans_32[12][k]
699
7.81M
                                                    * pi2_tmp[12 * trans_size]
700
7.81M
                                    + g_ai2_ihevc_trans_32[20][k]
701
7.81M
                                                    * pi2_tmp[20 * trans_size]
702
7.81M
                                    + g_ai2_ihevc_trans_32[28][k]
703
7.81M
                                                    * pi2_tmp[28 * trans_size];
704
7.81M
                }
705
1.95M
                eeeo[0] =
706
1.95M
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
707
1.95M
                                                + g_ai2_ihevc_trans_32[24][0]
708
1.95M
                                                                * pi2_tmp[24
709
1.95M
                                                                                * trans_size];
710
1.95M
                eeeo[1] =
711
1.95M
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
712
1.95M
                                                + g_ai2_ihevc_trans_32[24][1]
713
1.95M
                                                                * pi2_tmp[24
714
1.95M
                                                                                * trans_size];
715
1.95M
                eeee[0] =
716
1.95M
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
717
1.95M
                                                + g_ai2_ihevc_trans_32[16][0]
718
1.95M
                                                                * pi2_tmp[16
719
1.95M
                                                                                * trans_size];
720
1.95M
                eeee[1] =
721
1.95M
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
722
1.95M
                                                + g_ai2_ihevc_trans_32[16][1]
723
1.95M
                                                                * pi2_tmp[16
724
1.95M
                                                                                * trans_size];
725
726
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
727
1.95M
                eee[0] = eeee[0] + eeeo[0];
728
1.95M
                eee[3] = eeee[0] - eeeo[0];
729
1.95M
                eee[1] = eeee[1] + eeeo[1];
730
1.95M
                eee[2] = eeee[1] - eeeo[1];
731
9.74M
                for(k = 0; k < 4; k++)
732
7.79M
                {
733
7.79M
                    ee[k] = eee[k] + eeo[k];
734
7.79M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
735
7.79M
                }
736
17.5M
                for(k = 0; k < 8; k++)
737
15.5M
                {
738
15.5M
                    e[k] = ee[k] + eo[k];
739
15.5M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
740
15.5M
                }
741
32.5M
                for(k = 0; k < 16; k++)
742
30.5M
                {
743
30.5M
                    WORD32 itrans_out;
744
30.5M
                    itrans_out =
745
30.5M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
746
30.5M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
747
30.5M
                    itrans_out =
748
30.5M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
749
30.5M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
750
30.5M
                }
751
1.95M
                pi2_tmp++;
752
1.95M
                pu1_pred += pred_strd;
753
1.95M
                pu1_dst += dst_strd;
754
1.95M
            }
755
61.1k
        }
756
        /************************************************************************************************/
757
        /************************************END - IT_RECON_32x32****************************************/
758
        /************************************************************************************************/
759
233k
    }
760
360k
    else  /* All rows of input are non-zero */
761
360k
    {
762
        /************************************************************************************************/
763
        /**********************************START - IT_RECON_32x32****************************************/
764
        /************************************************************************************************/
765
        /* Inverse Transform 1st stage */
766
360k
        shift = IT_SHIFT_STAGE_1;
767
360k
        add = 1 << (shift - 1);
768
769
7.19M
        for(j = 0; j < row_limit_2nd_stage; j++)
770
6.83M
        {
771
            /* Checking for Zero Cols */
772
6.83M
            if((zero_cols & 1) == 1)
773
2.96M
            {
774
2.96M
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
775
2.96M
            }
776
3.87M
            else
777
3.87M
            {
778
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
779
65.8M
                for(k = 0; k < 16; k++)
780
61.9M
                {
781
61.9M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
782
61.9M
                                    + g_ai2_ihevc_trans_32[3][k]
783
61.9M
                                                    * pi2_src[3 * src_strd]
784
61.9M
                                    + g_ai2_ihevc_trans_32[5][k]
785
61.9M
                                                    * pi2_src[5 * src_strd]
786
61.9M
                                    + g_ai2_ihevc_trans_32[7][k]
787
61.9M
                                                    * pi2_src[7 * src_strd]
788
61.9M
                                    + g_ai2_ihevc_trans_32[9][k]
789
61.9M
                                                    * pi2_src[9 * src_strd]
790
61.9M
                                    + g_ai2_ihevc_trans_32[11][k]
791
61.9M
                                                    * pi2_src[11 * src_strd]
792
61.9M
                                    + g_ai2_ihevc_trans_32[13][k]
793
61.9M
                                                    * pi2_src[13 * src_strd]
794
61.9M
                                    + g_ai2_ihevc_trans_32[15][k]
795
61.9M
                                                    * pi2_src[15 * src_strd]
796
61.9M
                                    + g_ai2_ihevc_trans_32[17][k]
797
61.9M
                                                    * pi2_src[17 * src_strd]
798
61.9M
                                    + g_ai2_ihevc_trans_32[19][k]
799
61.9M
                                                    * pi2_src[19 * src_strd]
800
61.9M
                                    + g_ai2_ihevc_trans_32[21][k]
801
61.9M
                                                    * pi2_src[21 * src_strd]
802
61.9M
                                    + g_ai2_ihevc_trans_32[23][k]
803
61.9M
                                                    * pi2_src[23 * src_strd]
804
61.9M
                                    + g_ai2_ihevc_trans_32[25][k]
805
61.9M
                                                    * pi2_src[25 * src_strd]
806
61.9M
                                    + g_ai2_ihevc_trans_32[27][k]
807
61.9M
                                                    * pi2_src[27 * src_strd]
808
61.9M
                                    + g_ai2_ihevc_trans_32[29][k]
809
61.9M
                                                    * pi2_src[29 * src_strd]
810
61.9M
                                    + g_ai2_ihevc_trans_32[31][k]
811
61.9M
                                                    * pi2_src[31 * src_strd];
812
61.9M
                }
813
34.8M
                for(k = 0; k < 8; k++)
814
31.0M
                {
815
31.0M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
816
31.0M
                                    + g_ai2_ihevc_trans_32[6][k]
817
31.0M
                                                    * pi2_src[6 * src_strd]
818
31.0M
                                    + g_ai2_ihevc_trans_32[10][k]
819
31.0M
                                                    * pi2_src[10 * src_strd]
820
31.0M
                                    + g_ai2_ihevc_trans_32[14][k]
821
31.0M
                                                    * pi2_src[14 * src_strd]
822
31.0M
                                    + g_ai2_ihevc_trans_32[18][k]
823
31.0M
                                                    * pi2_src[18 * src_strd]
824
31.0M
                                    + g_ai2_ihevc_trans_32[22][k]
825
31.0M
                                                    * pi2_src[22 * src_strd]
826
31.0M
                                    + g_ai2_ihevc_trans_32[26][k]
827
31.0M
                                                    * pi2_src[26 * src_strd]
828
31.0M
                                    + g_ai2_ihevc_trans_32[30][k]
829
31.0M
                                                    * pi2_src[30 * src_strd];
830
31.0M
                }
831
19.3M
                for(k = 0; k < 4; k++)
832
15.5M
                {
833
15.5M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
834
15.5M
                                    + g_ai2_ihevc_trans_32[12][k]
835
15.5M
                                                    * pi2_src[12 * src_strd]
836
15.5M
                                    + g_ai2_ihevc_trans_32[20][k]
837
15.5M
                                                    * pi2_src[20 * src_strd]
838
15.5M
                                    + g_ai2_ihevc_trans_32[28][k]
839
15.5M
                                                    * pi2_src[28 * src_strd];
840
15.5M
                }
841
3.87M
                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
842
3.87M
                                + g_ai2_ihevc_trans_32[24][0]
843
3.87M
                                                * pi2_src[24 * src_strd];
844
3.87M
                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
845
3.87M
                                + g_ai2_ihevc_trans_32[24][1]
846
3.87M
                                                * pi2_src[24 * src_strd];
847
3.87M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
848
3.87M
                                + g_ai2_ihevc_trans_32[16][0]
849
3.87M
                                                * pi2_src[16 * src_strd];
850
3.87M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
851
3.87M
                                + g_ai2_ihevc_trans_32[16][1]
852
3.87M
                                                * pi2_src[16 * src_strd];
853
854
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
855
3.87M
                eee[0] = eeee[0] + eeeo[0];
856
3.87M
                eee[3] = eeee[0] - eeeo[0];
857
3.87M
                eee[1] = eeee[1] + eeeo[1];
858
3.87M
                eee[2] = eeee[1] - eeeo[1];
859
19.3M
                for(k = 0; k < 4; k++)
860
15.5M
                {
861
15.5M
                    ee[k] = eee[k] + eeo[k];
862
15.5M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
863
15.5M
                }
864
34.8M
                for(k = 0; k < 8; k++)
865
31.0M
                {
866
31.0M
                    e[k] = ee[k] + eo[k];
867
31.0M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
868
31.0M
                }
869
65.8M
                for(k = 0; k < 16; k++)
870
61.9M
                {
871
61.9M
                    pi2_tmp[k] =
872
61.9M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
873
61.9M
                    pi2_tmp[k + 16] =
874
61.9M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
875
61.9M
                }
876
3.87M
            }
877
6.83M
            pi2_src++;
878
6.83M
            pi2_tmp += trans_size;
879
6.83M
            zero_cols = zero_cols >> 1;
880
6.83M
        }
881
882
360k
        pi2_tmp = pi2_tmp_orig;
883
884
        /* Inverse Transform 2nd stage */
885
360k
        shift = IT_SHIFT_STAGE_2;
886
360k
        add = 1 << (shift - 1);
887
360k
        if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
888
86.4k
        {
889
2.77M
            for(j = 0; j < trans_size; j++)
890
2.69M
            {
891
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
892
44.2M
                for(k = 0; k < 16; k++)
893
41.5M
                {
894
41.5M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
895
41.5M
                                    + g_ai2_ihevc_trans_32[3][k]
896
41.5M
                                                    * pi2_tmp[3 * trans_size];
897
41.5M
                }
898
23.8M
                for(k = 0; k < 8; k++)
899
21.1M
                {
900
21.1M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
901
21.1M
                }
902
//                for(k = 0; k < 4; k++)
903
2.69M
                {
904
2.69M
                    eeo[0] = 0;
905
2.69M
                    eeo[1] = 0;
906
2.69M
                    eeo[2] = 0;
907
2.69M
                    eeo[3] = 0;
908
2.69M
                }
909
2.69M
                eeeo[0] = 0;
910
2.69M
                eeeo[1] = 0;
911
2.69M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
912
2.69M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
913
914
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
915
2.69M
                eee[0] = eeee[0] + eeeo[0];
916
2.69M
                eee[3] = eeee[0] - eeeo[0];
917
2.69M
                eee[1] = eeee[1] + eeeo[1];
918
2.69M
                eee[2] = eeee[1] - eeeo[1];
919
13.1M
                for(k = 0; k < 4; k++)
920
10.4M
                {
921
10.4M
                    ee[k] = eee[k] + eeo[k];
922
10.4M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
923
10.4M
                }
924
23.4M
                for(k = 0; k < 8; k++)
925
20.8M
                {
926
20.8M
                    e[k] = ee[k] + eo[k];
927
20.8M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
928
20.8M
                }
929
42.3M
                for(k = 0; k < 16; k++)
930
39.7M
                {
931
39.7M
                    WORD32 itrans_out;
932
39.7M
                    itrans_out =
933
39.7M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
934
39.7M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
935
39.7M
                    itrans_out =
936
39.7M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
937
39.7M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
938
39.7M
                }
939
2.69M
                pi2_tmp++;
940
2.69M
                pu1_pred += pred_strd;
941
2.69M
                pu1_dst += dst_strd;
942
2.69M
            }
943
86.4k
        }
944
274k
        else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
945
95.2k
        {
946
3.10M
            for(j = 0; j < trans_size; j++)
947
3.01M
            {
948
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
949
49.3M
                for(k = 0; k < 16; k++)
950
46.3M
                {
951
46.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
952
46.3M
                                    + g_ai2_ihevc_trans_32[3][k]
953
46.3M
                                                    * pi2_tmp[3 * trans_size]
954
46.3M
                                    + g_ai2_ihevc_trans_32[5][k]
955
46.3M
                                                    * pi2_tmp[5 * trans_size]
956
46.3M
                                    + g_ai2_ihevc_trans_32[7][k]
957
46.3M
                                                    * pi2_tmp[7 * trans_size];
958
46.3M
                }
959
26.9M
                for(k = 0; k < 8; k++)
960
23.9M
                {
961
23.9M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
962
23.9M
                                    + g_ai2_ihevc_trans_32[6][k]
963
23.9M
                                                    * pi2_tmp[6 * trans_size];
964
23.9M
                }
965
15.0M
                for(k = 0; k < 4; k++)
966
12.0M
                {
967
12.0M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
968
12.0M
                }
969
3.01M
                eeeo[0] = 0;
970
3.01M
                eeeo[1] = 0;
971
3.01M
                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
972
3.01M
                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
973
974
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
975
3.01M
                eee[0] = eeee[0] + eeeo[0];
976
3.01M
                eee[3] = eeee[0] - eeeo[0];
977
3.01M
                eee[1] = eeee[1] + eeeo[1];
978
3.01M
                eee[2] = eeee[1] - eeeo[1];
979
14.9M
                for(k = 0; k < 4; k++)
980
11.9M
                {
981
11.9M
                    ee[k] = eee[k] + eeo[k];
982
11.9M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
983
11.9M
                }
984
26.7M
                for(k = 0; k < 8; k++)
985
23.7M
                {
986
23.7M
                    e[k] = ee[k] + eo[k];
987
23.7M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
988
23.7M
                }
989
49.0M
                for(k = 0; k < 16; k++)
990
46.0M
                {
991
46.0M
                    WORD32 itrans_out;
992
46.0M
                    itrans_out =
993
46.0M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
994
46.0M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
995
46.0M
                    itrans_out =
996
46.0M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
997
46.0M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
998
46.0M
                }
999
3.01M
                pi2_tmp++;
1000
3.01M
                pu1_pred += pred_strd;
1001
3.01M
                pu1_dst += dst_strd;
1002
3.01M
            }
1003
95.2k
        }
1004
178k
        else /* All rows of output of 1st stage are non-zero */
1005
178k
        {
1006
5.88M
            for(j = 0; j < trans_size; j++)
1007
5.70M
            {
1008
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1009
94.0M
                for(k = 0; k < 16; k++)
1010
88.3M
                {
1011
88.3M
                    o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
1012
88.3M
                                    + g_ai2_ihevc_trans_32[3][k]
1013
88.3M
                                                    * pi2_tmp[3 * trans_size]
1014
88.3M
                                    + g_ai2_ihevc_trans_32[5][k]
1015
88.3M
                                                    * pi2_tmp[5 * trans_size]
1016
88.3M
                                    + g_ai2_ihevc_trans_32[7][k]
1017
88.3M
                                                    * pi2_tmp[7 * trans_size]
1018
88.3M
                                    + g_ai2_ihevc_trans_32[9][k]
1019
88.3M
                                                    * pi2_tmp[9 * trans_size]
1020
88.3M
                                    + g_ai2_ihevc_trans_32[11][k]
1021
88.3M
                                                    * pi2_tmp[11 * trans_size]
1022
88.3M
                                    + g_ai2_ihevc_trans_32[13][k]
1023
88.3M
                                                    * pi2_tmp[13 * trans_size]
1024
88.3M
                                    + g_ai2_ihevc_trans_32[15][k]
1025
88.3M
                                                    * pi2_tmp[15 * trans_size]
1026
88.3M
                                    + g_ai2_ihevc_trans_32[17][k]
1027
88.3M
                                                    * pi2_tmp[17 * trans_size]
1028
88.3M
                                    + g_ai2_ihevc_trans_32[19][k]
1029
88.3M
                                                    * pi2_tmp[19 * trans_size]
1030
88.3M
                                    + g_ai2_ihevc_trans_32[21][k]
1031
88.3M
                                                    * pi2_tmp[21 * trans_size]
1032
88.3M
                                    + g_ai2_ihevc_trans_32[23][k]
1033
88.3M
                                                    * pi2_tmp[23 * trans_size]
1034
88.3M
                                    + g_ai2_ihevc_trans_32[25][k]
1035
88.3M
                                                    * pi2_tmp[25 * trans_size]
1036
88.3M
                                    + g_ai2_ihevc_trans_32[27][k]
1037
88.3M
                                                    * pi2_tmp[27 * trans_size]
1038
88.3M
                                    + g_ai2_ihevc_trans_32[29][k]
1039
88.3M
                                                    * pi2_tmp[29 * trans_size]
1040
88.3M
                                    + g_ai2_ihevc_trans_32[31][k]
1041
88.3M
                                                    * pi2_tmp[31 * trans_size];
1042
88.3M
                }
1043
50.7M
                for(k = 0; k < 8; k++)
1044
45.0M
                {
1045
45.0M
                    eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
1046
45.0M
                                    + g_ai2_ihevc_trans_32[6][k]
1047
45.0M
                                                    * pi2_tmp[6 * trans_size]
1048
45.0M
                                    + g_ai2_ihevc_trans_32[10][k]
1049
45.0M
                                                    * pi2_tmp[10 * trans_size]
1050
45.0M
                                    + g_ai2_ihevc_trans_32[14][k]
1051
45.0M
                                                    * pi2_tmp[14 * trans_size]
1052
45.0M
                                    + g_ai2_ihevc_trans_32[18][k]
1053
45.0M
                                                    * pi2_tmp[18 * trans_size]
1054
45.0M
                                    + g_ai2_ihevc_trans_32[22][k]
1055
45.0M
                                                    * pi2_tmp[22 * trans_size]
1056
45.0M
                                    + g_ai2_ihevc_trans_32[26][k]
1057
45.0M
                                                    * pi2_tmp[26 * trans_size]
1058
45.0M
                                    + g_ai2_ihevc_trans_32[30][k]
1059
45.0M
                                                    * pi2_tmp[30 * trans_size];
1060
45.0M
                }
1061
28.4M
                for(k = 0; k < 4; k++)
1062
22.7M
                {
1063
22.7M
                    eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
1064
22.7M
                                    + g_ai2_ihevc_trans_32[12][k]
1065
22.7M
                                                    * pi2_tmp[12 * trans_size]
1066
22.7M
                                    + g_ai2_ihevc_trans_32[20][k]
1067
22.7M
                                                    * pi2_tmp[20 * trans_size]
1068
22.7M
                                    + g_ai2_ihevc_trans_32[28][k]
1069
22.7M
                                                    * pi2_tmp[28 * trans_size];
1070
22.7M
                }
1071
5.70M
                eeeo[0] =
1072
5.70M
                                g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
1073
5.70M
                                                + g_ai2_ihevc_trans_32[24][0]
1074
5.70M
                                                                * pi2_tmp[24
1075
5.70M
                                                                                * trans_size];
1076
5.70M
                eeeo[1] =
1077
5.70M
                                g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
1078
5.70M
                                                + g_ai2_ihevc_trans_32[24][1]
1079
5.70M
                                                                * pi2_tmp[24
1080
5.70M
                                                                                * trans_size];
1081
5.70M
                eeee[0] =
1082
5.70M
                                g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
1083
5.70M
                                                + g_ai2_ihevc_trans_32[16][0]
1084
5.70M
                                                                * pi2_tmp[16
1085
5.70M
                                                                                * trans_size];
1086
5.70M
                eeee[1] =
1087
5.70M
                                g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
1088
5.70M
                                                + g_ai2_ihevc_trans_32[16][1]
1089
5.70M
                                                                * pi2_tmp[16
1090
5.70M
                                                                                * trans_size];
1091
1092
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
1093
5.70M
                eee[0] = eeee[0] + eeeo[0];
1094
5.70M
                eee[3] = eeee[0] - eeeo[0];
1095
5.70M
                eee[1] = eeee[1] + eeeo[1];
1096
5.70M
                eee[2] = eeee[1] - eeeo[1];
1097
28.4M
                for(k = 0; k < 4; k++)
1098
22.7M
                {
1099
22.7M
                    ee[k] = eee[k] + eeo[k];
1100
22.7M
                    ee[k + 4] = eee[3 - k] - eeo[3 - k];
1101
22.7M
                }
1102
51.1M
                for(k = 0; k < 8; k++)
1103
45.4M
                {
1104
45.4M
                    e[k] = ee[k] + eo[k];
1105
45.4M
                    e[k + 8] = ee[7 - k] - eo[7 - k];
1106
45.4M
                }
1107
95.4M
                for(k = 0; k < 16; k++)
1108
89.7M
                {
1109
89.7M
                    WORD32 itrans_out;
1110
89.7M
                    itrans_out =
1111
89.7M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
1112
89.7M
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
1113
89.7M
                    itrans_out =
1114
89.7M
                                    CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
1115
89.7M
                    pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
1116
89.7M
                }
1117
5.70M
                pi2_tmp++;
1118
5.70M
                pu1_pred += pred_strd;
1119
5.70M
                pu1_dst += dst_strd;
1120
5.70M
            }
1121
178k
        }
1122
        /************************************************************************************************/
1123
        /************************************END - IT_RECON_32x32****************************************/
1124
        /************************************************************************************************/
1125
360k
    }
1126
904k
}
1127