Coverage Report

Created: 2026-02-14 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/common/ihevc_chroma_itrans_recon_16x16.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
 *******************************************************************************
20
 * @file
21
 *  ihevc_chroma_itrans_recon_16x16.c
22
 *
23
 * @brief
24
 *  Contains function definitions for 16x16 inverse transform  and reconstruction
25
 * of chroma interleaved data.
26
 *
27
 * @author
28
 *  100470
29
 *
30
 * @par List of Functions:
31
 *  - ihevc_chroma_itrans_recon_16x16()
32
 *
33
 * @remarks
34
 *  None
35
 *
36
 *******************************************************************************
37
 */
38
39
#include <stdio.h>
40
#include <string.h>
41
#include "ihevc_typedefs.h"
42
#include "ihevc_macros.h"
43
#include "ihevc_platform_macros.h"
44
#include "ihevc_defs.h"
45
#include "ihevc_trans_tables.h"
46
#include "ihevc_chroma_itrans_recon.h"
47
#include "ihevc_func_selector.h"
48
#include "ihevc_trans_macros.h"
49
50
/* All the functions work on one component (U or V) of interleaved data, depending upon the pointers passed to them */
51
/* Data visualization */
52
/* U V U V U V U V */
53
/* U V U V U V U V */
54
/* U V U V U V U V */
55
/* U V U V U V U V */
56
/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
56
/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
58
59
60
/**
61
 *******************************************************************************
62
 *
63
 * @brief
64
 *  This function performs Inverse transform  and reconstruction for 16x16
65
 * input block
66
 *
67
 * @par Description:
68
 *  Performs inverse transform and adds the prediction  data and clips output
69
 * to 8 bit
70
 *
71
 * @param[in] pi2_src
72
 *  Input 16x16 coefficients
73
 *
74
 * @param[in] pi2_tmp
75
 *  Temporary 16x16 buffer for storing inverse transform
76
 *  1st stage output
77
 *
78
 * @param[in] pu1_pred
79
 *  Prediction 16x16 block
80
 *
81
 * @param[out] pu1_dst
82
 *  Output 16x16 block
83
 *
84
 * @param[in] src_strd
85
 *  Input stride
86
 *
87
 * @param[in] pred_strd
88
 *  Prediction stride
89
 *
90
 * @param[in] dst_strd
91
 *  Output Stride
92
 *
93
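 * @param[in] zero_cols
94
 *  Bitmask of zero columns in pi2_src (bit j set => column j is all zero)
95
 *
96
 * @param[in] zero_rows
97
 *  Bitmask of zero rows in pi2_src (a set bit marks an all-zero row)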
98
 *
99
 * @returns  Void
100
 *
101
 * @remarks
102
 *  None
103
 *
104
 *******************************************************************************
105
 */
106
107
108
void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
109
                                     WORD16 *pi2_tmp,
110
                                     UWORD8 *pu1_pred,
111
                                     UWORD8 *pu1_dst,
112
                                     WORD32 src_strd,
113
                                     WORD32 pred_strd,
114
                                     WORD32 dst_strd,
115
                                     WORD32 zero_cols,
116
                                     WORD32 zero_rows)
117
85.6k
{
118
85.6k
    WORD32 j, k;
119
85.6k
    WORD32 e[8], o[8];
120
85.6k
    WORD32 ee[4], eo[4];
121
85.6k
    WORD32 eee[2], eeo[2];
122
85.6k
    WORD32 add;
123
85.6k
    WORD32 shift;
124
85.6k
    WORD16 *pi2_tmp_orig;
125
85.6k
    WORD32 trans_size;
126
85.6k
    WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
127
128
85.6k
    trans_size = TRANS_SIZE_16;
129
85.6k
    pi2_tmp_orig = pi2_tmp;
130
131
85.6k
    if((zero_cols & 0xFFF0) == 0xFFF0)
132
41.3k
        row_limit_2nd_stage = 4;
133
44.2k
    else if((zero_cols & 0xFF00) == 0xFF00)
134
18.6k
        row_limit_2nd_stage = 8;
135
25.6k
    else
136
25.6k
        row_limit_2nd_stage = TRANS_SIZE_16;
137
138
85.6k
    if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
139
41.3k
    {
140
        /************************************************************************************************/
141
        /**********************************START - IT_RECON_16x16****************************************/
142
        /************************************************************************************************/
143
144
        /* Inverse Transform 1st stage */
145
41.3k
        shift = IT_SHIFT_STAGE_1;
146
41.3k
        add = 1 << (shift - 1);
147
148
343k
        for(j = 0; j < row_limit_2nd_stage; j++)
149
302k
        {
150
            /* Checking for Zero Cols */
151
302k
            if((zero_cols & 1) == 1)
152
133k
            {
153
133k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
154
133k
            }
155
168k
            else
156
168k
            {
157
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
158
1.51M
                for(k = 0; k < 8; k++)
159
1.34M
                {
160
1.34M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
161
1.34M
                                    + g_ai2_ihevc_trans_16[3][k]
162
1.34M
                                                    * pi2_src[3 * src_strd];
163
1.34M
                }
164
843k
                for(k = 0; k < 4; k++)
165
674k
                {
166
674k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
167
674k
                }
168
168k
                eeo[0] = 0;
169
168k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
170
168k
                eeo[1] = 0;
171
168k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
172
173
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
174
506k
                for(k = 0; k < 2; k++)
175
337k
                {
176
337k
                    ee[k] = eee[k] + eeo[k];
177
337k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
178
337k
                }
179
843k
                for(k = 0; k < 4; k++)
180
674k
                {
181
674k
                    e[k] = ee[k] + eo[k];
182
674k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
183
674k
                }
184
1.51M
                for(k = 0; k < 8; k++)
185
1.34M
                {
186
1.34M
                    pi2_tmp[k] =
187
1.34M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
188
1.34M
                    pi2_tmp[k + 8] =
189
1.34M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
190
1.34M
                }
191
168k
            }
192
302k
            pi2_src++;
193
302k
            pi2_tmp += trans_size;
194
302k
            zero_cols = zero_cols >> 1;
195
302k
        }
196
197
41.3k
        pi2_tmp = pi2_tmp_orig;
198
199
        /* Inverse Transform 2nd stage */
200
41.3k
        shift = IT_SHIFT_STAGE_2;
201
41.3k
        add = 1 << (shift - 1);
202
41.3k
        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
203
25.1k
        {
204
427k
            for(j = 0; j < trans_size; j++)
205
402k
            {
206
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
207
3.62M
                for(k = 0; k < 8; k++)
208
3.22M
                {
209
3.22M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
210
3.22M
                                    + g_ai2_ihevc_trans_16[3][k]
211
3.22M
                                                    * pi2_tmp[3 * trans_size];
212
3.22M
                }
213
2.01M
                for(k = 0; k < 4; k++)
214
1.61M
                {
215
1.61M
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
216
1.61M
                }
217
402k
                eeo[0] = 0;
218
402k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
219
402k
                eeo[1] = 0;
220
402k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
221
222
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
223
1.20M
                for(k = 0; k < 2; k++)
224
805k
                {
225
805k
                    ee[k] = eee[k] + eeo[k];
226
805k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
227
805k
                }
228
2.01M
                for(k = 0; k < 4; k++)
229
1.61M
                {
230
1.61M
                    e[k] = ee[k] + eo[k];
231
1.61M
                    e[k + 4] = ee[3 - k] - eo[3 - k];
232
1.61M
                }
233
3.62M
                for(k = 0; k < 8; k++)
234
3.22M
                {
235
3.22M
                    WORD32 itrans_out;
236
3.22M
                    itrans_out =
237
3.22M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
238
3.22M
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
239
3.22M
                    itrans_out =
240
3.22M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
241
3.22M
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
242
3.22M
                }
243
402k
                pi2_tmp++;
244
402k
                pu1_pred += pred_strd;
245
402k
                pu1_dst += dst_strd;
246
402k
            }
247
25.1k
        }
248
16.1k
        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
249
7.19k
        {
250
122k
            for(j = 0; j < trans_size; j++)
251
115k
            {
252
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
253
1.03M
                for(k = 0; k < 8; k++)
254
921k
                {
255
921k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
256
921k
                                    + g_ai2_ihevc_trans_16[3][k]
257
921k
                                                    * pi2_tmp[3 * trans_size]
258
921k
                                    + g_ai2_ihevc_trans_16[5][k]
259
921k
                                                    * pi2_tmp[5 * trans_size]
260
921k
                                    + g_ai2_ihevc_trans_16[7][k]
261
921k
                                                    * pi2_tmp[7 * trans_size];
262
921k
                }
263
575k
                for(k = 0; k < 4; k++)
264
460k
                {
265
460k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
266
460k
                                    + g_ai2_ihevc_trans_16[6][k]
267
460k
                                                    * pi2_tmp[6 * trans_size];
268
460k
                }
269
115k
                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
270
115k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
271
115k
                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
272
115k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
273
274
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
275
345k
                for(k = 0; k < 2; k++)
276
230k
                {
277
230k
                    ee[k] = eee[k] + eeo[k];
278
230k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
279
230k
                }
280
575k
                for(k = 0; k < 4; k++)
281
460k
                {
282
460k
                    e[k] = ee[k] + eo[k];
283
460k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
284
460k
                }
285
1.03M
                for(k = 0; k < 8; k++)
286
921k
                {
287
921k
                    WORD32 itrans_out;
288
921k
                    itrans_out =
289
921k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
290
921k
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
291
921k
                    itrans_out =
292
921k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
293
921k
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
294
921k
                }
295
115k
                pi2_tmp++;
296
115k
                pu1_pred += pred_strd;
297
115k
                pu1_dst += dst_strd;
298
115k
            }
299
7.19k
        }
300
9.00k
        else /* All rows of output of 1st stage are non-zero */
301
9.00k
        {
302
153k
            for(j = 0; j < trans_size; j++)
303
144k
            {
304
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
305
1.29M
                for(k = 0; k < 8; k++)
306
1.15M
                {
307
1.15M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
308
1.15M
                                    + g_ai2_ihevc_trans_16[3][k]
309
1.15M
                                                    * pi2_tmp[3 * trans_size]
310
1.15M
                                    + g_ai2_ihevc_trans_16[5][k]
311
1.15M
                                                    * pi2_tmp[5 * trans_size]
312
1.15M
                                    + g_ai2_ihevc_trans_16[7][k]
313
1.15M
                                                    * pi2_tmp[7 * trans_size]
314
1.15M
                                    + g_ai2_ihevc_trans_16[9][k]
315
1.15M
                                                    * pi2_tmp[9 * trans_size]
316
1.15M
                                    + g_ai2_ihevc_trans_16[11][k]
317
1.15M
                                                    * pi2_tmp[11 * trans_size]
318
1.15M
                                    + g_ai2_ihevc_trans_16[13][k]
319
1.15M
                                                    * pi2_tmp[13 * trans_size]
320
1.15M
                                    + g_ai2_ihevc_trans_16[15][k]
321
1.15M
                                                    * pi2_tmp[15 * trans_size];
322
1.15M
                }
323
720k
                for(k = 0; k < 4; k++)
324
576k
                {
325
576k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
326
576k
                                    + g_ai2_ihevc_trans_16[6][k]
327
576k
                                                    * pi2_tmp[6 * trans_size]
328
576k
                                    + g_ai2_ihevc_trans_16[10][k]
329
576k
                                                    * pi2_tmp[10 * trans_size]
330
576k
                                    + g_ai2_ihevc_trans_16[14][k]
331
576k
                                                    * pi2_tmp[14 * trans_size];
332
576k
                }
333
144k
                eeo[0] =
334
144k
                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
335
144k
                                                + g_ai2_ihevc_trans_16[12][0]
336
144k
                                                                * pi2_tmp[12
337
144k
                                                                                * trans_size];
338
144k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
339
144k
                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
340
144k
                eeo[1] =
341
144k
                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
342
144k
                                                + g_ai2_ihevc_trans_16[12][1]
343
144k
                                                                * pi2_tmp[12
344
144k
                                                                                * trans_size];
345
144k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
346
144k
                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
347
348
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
349
432k
                for(k = 0; k < 2; k++)
350
288k
                {
351
288k
                    ee[k] = eee[k] + eeo[k];
352
288k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
353
288k
                }
354
720k
                for(k = 0; k < 4; k++)
355
576k
                {
356
576k
                    e[k] = ee[k] + eo[k];
357
576k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
358
576k
                }
359
1.29M
                for(k = 0; k < 8; k++)
360
1.15M
                {
361
1.15M
                    WORD32 itrans_out;
362
1.15M
                    itrans_out =
363
1.15M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
364
1.15M
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
365
1.15M
                    itrans_out =
366
1.15M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
367
1.15M
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
368
1.15M
                }
369
144k
                pi2_tmp++;
370
144k
                pu1_pred += pred_strd;
371
144k
                pu1_dst += dst_strd;
372
144k
            }
373
9.00k
        }
374
        /************************************************************************************************/
375
        /************************************END - IT_RECON_16x16****************************************/
376
        /************************************************************************************************/
377
41.3k
    }
378
44.2k
    else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
379
21.5k
    {
380
        /************************************************************************************************/
381
        /**********************************START - IT_RECON_16x16****************************************/
382
        /************************************************************************************************/
383
384
        /* Inverse Transform 1st stage */
385
21.5k
        shift = IT_SHIFT_STAGE_1;
386
21.5k
        add = 1 << (shift - 1);
387
388
197k
        for(j = 0; j < row_limit_2nd_stage; j++)
389
176k
        {
390
            /* Checking for Zero Cols */
391
176k
            if((zero_cols & 1) == 1)
392
98.6k
            {
393
98.6k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
394
98.6k
            }
395
77.4k
            else
396
77.4k
            {
397
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
398
696k
                for(k = 0; k < 8; k++)
399
619k
                {
400
619k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
401
619k
                                    + g_ai2_ihevc_trans_16[3][k]
402
619k
                                                    * pi2_src[3 * src_strd]
403
619k
                                    + g_ai2_ihevc_trans_16[5][k]
404
619k
                                                    * pi2_src[5 * src_strd]
405
619k
                                    + g_ai2_ihevc_trans_16[7][k]
406
619k
                                                    * pi2_src[7 * src_strd];
407
619k
                }
408
387k
                for(k = 0; k < 4; k++)
409
309k
                {
410
309k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
411
309k
                                    + g_ai2_ihevc_trans_16[6][k]
412
309k
                                                    * pi2_src[6 * src_strd];
413
309k
                }
414
77.4k
                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
415
77.4k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
416
77.4k
                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
417
77.4k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
418
419
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
420
232k
                for(k = 0; k < 2; k++)
421
154k
                {
422
154k
                    ee[k] = eee[k] + eeo[k];
423
154k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
424
154k
                }
425
387k
                for(k = 0; k < 4; k++)
426
309k
                {
427
309k
                    e[k] = ee[k] + eo[k];
428
309k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
429
309k
                }
430
696k
                for(k = 0; k < 8; k++)
431
619k
                {
432
619k
                    pi2_tmp[k] =
433
619k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
434
619k
                    pi2_tmp[k + 8] =
435
619k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
436
619k
                }
437
77.4k
            }
438
176k
            pi2_src++;
439
176k
            pi2_tmp += trans_size;
440
176k
            zero_cols = zero_cols >> 1;
441
176k
        }
442
443
21.5k
        pi2_tmp = pi2_tmp_orig;
444
445
        /* Inverse Transform 2nd stage */
446
21.5k
        shift = IT_SHIFT_STAGE_2;
447
21.5k
        add = 1 << (shift - 1);
448
21.5k
        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
449
11.0k
        {
450
187k
            for(j = 0; j < trans_size; j++)
451
176k
            {
452
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
453
1.58M
                for(k = 0; k < 8; k++)
454
1.40M
                {
455
1.40M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
456
1.40M
                                    + g_ai2_ihevc_trans_16[3][k]
457
1.40M
                                                    * pi2_tmp[3 * trans_size];
458
1.40M
                }
459
881k
                for(k = 0; k < 4; k++)
460
705k
                {
461
705k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
462
705k
                }
463
176k
                eeo[0] = 0;
464
176k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
465
176k
                eeo[1] = 0;
466
176k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
467
468
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
469
529k
                for(k = 0; k < 2; k++)
470
352k
                {
471
352k
                    ee[k] = eee[k] + eeo[k];
472
352k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
473
352k
                }
474
881k
                for(k = 0; k < 4; k++)
475
704k
                {
476
704k
                    e[k] = ee[k] + eo[k];
477
704k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
478
704k
                }
479
1.58M
                for(k = 0; k < 8; k++)
480
1.40M
                {
481
1.40M
                    WORD32 itrans_out;
482
1.40M
                    itrans_out =
483
1.40M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
484
1.40M
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
485
1.40M
                    itrans_out =
486
1.40M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
487
1.40M
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
488
1.40M
                }
489
176k
                pi2_tmp++;
490
176k
                pu1_pred += pred_strd;
491
176k
                pu1_dst += dst_strd;
492
176k
            }
493
11.0k
        }
494
10.4k
        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
495
4.49k
        {
496
76.3k
            for(j = 0; j < trans_size; j++)
497
71.8k
            {
498
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
499
646k
                for(k = 0; k < 8; k++)
500
574k
                {
501
574k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
502
574k
                                    + g_ai2_ihevc_trans_16[3][k]
503
574k
                                                    * pi2_tmp[3 * trans_size]
504
574k
                                    + g_ai2_ihevc_trans_16[5][k]
505
574k
                                                    * pi2_tmp[5 * trans_size]
506
574k
                                    + g_ai2_ihevc_trans_16[7][k]
507
574k
                                                    * pi2_tmp[7 * trans_size];
508
574k
                }
509
359k
                for(k = 0; k < 4; k++)
510
287k
                {
511
287k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
512
287k
                                    + g_ai2_ihevc_trans_16[6][k]
513
287k
                                                    * pi2_tmp[6 * trans_size];
514
287k
                }
515
71.8k
                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
516
71.8k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
517
71.8k
                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
518
71.8k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
519
520
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
521
215k
                for(k = 0; k < 2; k++)
522
143k
                {
523
143k
                    ee[k] = eee[k] + eeo[k];
524
143k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
525
143k
                }
526
359k
                for(k = 0; k < 4; k++)
527
287k
                {
528
287k
                    e[k] = ee[k] + eo[k];
529
287k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
530
287k
                }
531
646k
                for(k = 0; k < 8; k++)
532
574k
                {
533
574k
                    WORD32 itrans_out;
534
574k
                    itrans_out =
535
574k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
536
574k
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
537
574k
                    itrans_out =
538
574k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
539
574k
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
540
574k
                }
541
71.8k
                pi2_tmp++;
542
71.8k
                pu1_pred += pred_strd;
543
71.8k
                pu1_dst += dst_strd;
544
71.8k
            }
545
4.49k
        }
546
5.99k
        else /* All rows of output of 1st stage are non-zero */
547
5.99k
        {
548
101k
            for(j = 0; j < trans_size; j++)
549
95.6k
            {
550
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
551
858k
                for(k = 0; k < 8; k++)
552
762k
                {
553
762k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
554
762k
                                    + g_ai2_ihevc_trans_16[3][k]
555
762k
                                                    * pi2_tmp[3 * trans_size]
556
762k
                                    + g_ai2_ihevc_trans_16[5][k]
557
762k
                                                    * pi2_tmp[5 * trans_size]
558
762k
                                    + g_ai2_ihevc_trans_16[7][k]
559
762k
                                                    * pi2_tmp[7 * trans_size]
560
762k
                                    + g_ai2_ihevc_trans_16[9][k]
561
762k
                                                    * pi2_tmp[9 * trans_size]
562
762k
                                    + g_ai2_ihevc_trans_16[11][k]
563
762k
                                                    * pi2_tmp[11 * trans_size]
564
762k
                                    + g_ai2_ihevc_trans_16[13][k]
565
762k
                                                    * pi2_tmp[13 * trans_size]
566
762k
                                    + g_ai2_ihevc_trans_16[15][k]
567
762k
                                                    * pi2_tmp[15 * trans_size];
568
762k
                }
569
478k
                for(k = 0; k < 4; k++)
570
382k
                {
571
382k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
572
382k
                                    + g_ai2_ihevc_trans_16[6][k]
573
382k
                                                    * pi2_tmp[6 * trans_size]
574
382k
                                    + g_ai2_ihevc_trans_16[10][k]
575
382k
                                                    * pi2_tmp[10 * trans_size]
576
382k
                                    + g_ai2_ihevc_trans_16[14][k]
577
382k
                                                    * pi2_tmp[14 * trans_size];
578
382k
                }
579
95.6k
                eeo[0] =
580
95.6k
                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
581
95.6k
                                                + g_ai2_ihevc_trans_16[12][0]
582
95.6k
                                                                * pi2_tmp[12
583
95.6k
                                                                                * trans_size];
584
95.6k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
585
95.6k
                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
586
95.6k
                eeo[1] =
587
95.6k
                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
588
95.6k
                                                + g_ai2_ihevc_trans_16[12][1]
589
95.6k
                                                                * pi2_tmp[12
590
95.6k
                                                                                * trans_size];
591
95.6k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
592
95.6k
                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
593
594
                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
595
286k
                for(k = 0; k < 2; k++)
596
191k
                {
597
191k
                    ee[k] = eee[k] + eeo[k];
598
191k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
599
191k
                }
600
478k
                for(k = 0; k < 4; k++)
601
382k
                {
602
382k
                    e[k] = ee[k] + eo[k];
603
382k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
604
382k
                }
605
858k
                for(k = 0; k < 8; k++)
606
762k
                {
607
762k
                    WORD32 itrans_out;
608
762k
                    itrans_out =
609
762k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
610
762k
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
611
762k
                    itrans_out =
612
762k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
613
762k
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
614
762k
                }
615
95.6k
                pi2_tmp++;
616
95.6k
                pu1_pred += pred_strd;
617
95.6k
                pu1_dst += dst_strd;
618
95.6k
            }
619
5.99k
        }
620
        /************************************************************************************************/
621
        /************************************END - IT_RECON_16x16****************************************/
622
        /************************************************************************************************/
623
21.5k
    }
624
22.6k
    else /* All rows of input are non-zero */
625
22.6k
    {
626
        /************************************************************************************************/
627
        /**********************************START - IT_RECON_16x16****************************************/
628
        /************************************************************************************************/
629
630
        /* Inverse Transform 1st stage */
631
22.6k
        shift = IT_SHIFT_STAGE_1;
632
22.6k
        add = 1 << (shift - 1);
633
634
269k
        for(j = 0; j < row_limit_2nd_stage; j++)
635
246k
        {
636
            /* Checking for Zero Cols */
637
246k
            if((zero_cols & 1) == 1)
638
74.2k
            {
639
74.2k
                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
640
74.2k
            }
641
172k
            else
642
172k
            {
643
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
644
1.54M
                for(k = 0; k < 8; k++)
645
1.37M
                {
646
1.37M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
647
1.37M
                                    + g_ai2_ihevc_trans_16[3][k]
648
1.37M
                                                    * pi2_src[3 * src_strd]
649
1.37M
                                    + g_ai2_ihevc_trans_16[5][k]
650
1.37M
                                                    * pi2_src[5 * src_strd]
651
1.37M
                                    + g_ai2_ihevc_trans_16[7][k]
652
1.37M
                                                    * pi2_src[7 * src_strd]
653
1.37M
                                    + g_ai2_ihevc_trans_16[9][k]
654
1.37M
                                                    * pi2_src[9 * src_strd]
655
1.37M
                                    + g_ai2_ihevc_trans_16[11][k]
656
1.37M
                                                    * pi2_src[11 * src_strd]
657
1.37M
                                    + g_ai2_ihevc_trans_16[13][k]
658
1.37M
                                                    * pi2_src[13 * src_strd]
659
1.37M
                                    + g_ai2_ihevc_trans_16[15][k]
660
1.37M
                                                    * pi2_src[15 * src_strd];
661
1.37M
                }
662
860k
                for(k = 0; k < 4; k++)
663
688k
                {
664
688k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
665
688k
                                    + g_ai2_ihevc_trans_16[6][k]
666
688k
                                                    * pi2_src[6 * src_strd]
667
688k
                                    + g_ai2_ihevc_trans_16[10][k]
668
688k
                                                    * pi2_src[10 * src_strd]
669
688k
                                    + g_ai2_ihevc_trans_16[14][k]
670
688k
                                                    * pi2_src[14 * src_strd];
671
688k
                }
672
172k
                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
673
172k
                                + g_ai2_ihevc_trans_16[12][0]
674
172k
                                                * pi2_src[12 * src_strd];
675
172k
                eee[0] =
676
172k
                                g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
677
172k
                                                + g_ai2_ihevc_trans_16[8][0]
678
172k
                                                                * pi2_src[8
679
172k
                                                                                * src_strd];
680
172k
                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
681
172k
                                + g_ai2_ihevc_trans_16[12][1]
682
172k
                                                * pi2_src[12 * src_strd];
683
172k
                eee[1] =
684
172k
                                g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
685
172k
                                                + g_ai2_ihevc_trans_16[8][1]
686
172k
                                                                * pi2_src[8
687
172k
                                                                                * src_strd];
688
689
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
690
516k
                for(k = 0; k < 2; k++)
691
344k
                {
692
344k
                    ee[k] = eee[k] + eeo[k];
693
344k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
694
344k
                }
695
860k
                for(k = 0; k < 4; k++)
696
688k
                {
697
688k
                    e[k] = ee[k] + eo[k];
698
688k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
699
688k
                }
700
1.54M
                for(k = 0; k < 8; k++)
701
1.37M
                {
702
1.37M
                    pi2_tmp[k] =
703
1.37M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
704
1.37M
                    pi2_tmp[k + 8] =
705
1.37M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
706
1.37M
                }
707
172k
            }
708
246k
            pi2_src++;
709
246k
            pi2_tmp += trans_size;
710
246k
            zero_cols = zero_cols >> 1;
711
246k
        }
712
713
22.6k
        pi2_tmp = pi2_tmp_orig;
714
715
        /* Inverse Transform 2nd stage */
716
22.6k
        shift = IT_SHIFT_STAGE_2;
717
22.6k
        add = 1 << (shift - 1);
718
22.6k
        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
719
5.10k
        {
720
86.7k
            for(j = 0; j < trans_size; j++)
721
81.6k
            {
722
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
723
734k
                for(k = 0; k < 8; k++)
724
652k
                {
725
652k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
726
652k
                                    + g_ai2_ihevc_trans_16[3][k]
727
652k
                                                    * pi2_tmp[3 * trans_size];
728
652k
                }
729
408k
                for(k = 0; k < 4; k++)
730
326k
                {
731
326k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
732
326k
                }
733
81.6k
                eeo[0] = 0;
734
81.6k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
735
81.6k
                eeo[1] = 0;
736
81.6k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
737
738
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
739
244k
                for(k = 0; k < 2; k++)
740
163k
                {
741
163k
                    ee[k] = eee[k] + eeo[k];
742
163k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
743
163k
                }
744
407k
                for(k = 0; k < 4; k++)
745
326k
                {
746
326k
                    e[k] = ee[k] + eo[k];
747
326k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
748
326k
                }
749
732k
                for(k = 0; k < 8; k++)
750
650k
                {
751
650k
                    WORD32 itrans_out;
752
650k
                    itrans_out =
753
650k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
754
650k
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
755
650k
                    itrans_out =
756
650k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
757
650k
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
758
650k
                }
759
81.6k
                pi2_tmp++;
760
81.6k
                pu1_pred += pred_strd;
761
81.6k
                pu1_dst += dst_strd;
762
81.6k
            }
763
5.10k
        }
764
17.5k
        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
765
6.92k
        {
766
116k
            for(j = 0; j < trans_size; j++)
767
109k
            {
768
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
769
977k
                for(k = 0; k < 8; k++)
770
867k
                {
771
867k
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
772
867k
                                    + g_ai2_ihevc_trans_16[3][k]
773
867k
                                                    * pi2_tmp[3 * trans_size]
774
867k
                                    + g_ai2_ihevc_trans_16[5][k]
775
867k
                                                    * pi2_tmp[5 * trans_size]
776
867k
                                    + g_ai2_ihevc_trans_16[7][k]
777
867k
                                                    * pi2_tmp[7 * trans_size];
778
867k
                }
779
546k
                for(k = 0; k < 4; k++)
780
436k
                {
781
436k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
782
436k
                                    + g_ai2_ihevc_trans_16[6][k]
783
436k
                                                    * pi2_tmp[6 * trans_size];
784
436k
                }
785
109k
                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
786
109k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
787
109k
                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
788
109k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
789
790
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
791
328k
                for(k = 0; k < 2; k++)
792
218k
                {
793
218k
                    ee[k] = eee[k] + eeo[k];
794
218k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
795
218k
                }
796
547k
                for(k = 0; k < 4; k++)
797
437k
                {
798
437k
                    e[k] = ee[k] + eo[k];
799
437k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
800
437k
                }
801
970k
                for(k = 0; k < 8; k++)
802
860k
                {
803
860k
                    WORD32 itrans_out;
804
860k
                    itrans_out =
805
860k
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
806
860k
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
807
860k
                    itrans_out =
808
860k
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
809
860k
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
810
860k
                }
811
109k
                pi2_tmp++;
812
109k
                pu1_pred += pred_strd;
813
109k
                pu1_dst += dst_strd;
814
109k
            }
815
6.92k
        }
816
10.6k
        else /* All rows of output of 1st stage are non-zero */
817
10.6k
        {
818
181k
            for(j = 0; j < trans_size; j++)
819
170k
            {
820
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
821
1.53M
                for(k = 0; k < 8; k++)
822
1.36M
                {
823
1.36M
                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
824
1.36M
                                    + g_ai2_ihevc_trans_16[3][k]
825
1.36M
                                                    * pi2_tmp[3 * trans_size]
826
1.36M
                                    + g_ai2_ihevc_trans_16[5][k]
827
1.36M
                                                    * pi2_tmp[5 * trans_size]
828
1.36M
                                    + g_ai2_ihevc_trans_16[7][k]
829
1.36M
                                                    * pi2_tmp[7 * trans_size]
830
1.36M
                                    + g_ai2_ihevc_trans_16[9][k]
831
1.36M
                                                    * pi2_tmp[9 * trans_size]
832
1.36M
                                    + g_ai2_ihevc_trans_16[11][k]
833
1.36M
                                                    * pi2_tmp[11 * trans_size]
834
1.36M
                                    + g_ai2_ihevc_trans_16[13][k]
835
1.36M
                                                    * pi2_tmp[13 * trans_size]
836
1.36M
                                    + g_ai2_ihevc_trans_16[15][k]
837
1.36M
                                                    * pi2_tmp[15 * trans_size];
838
1.36M
                }
839
853k
                for(k = 0; k < 4; k++)
840
682k
                {
841
682k
                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
842
682k
                                    + g_ai2_ihevc_trans_16[6][k]
843
682k
                                                    * pi2_tmp[6 * trans_size]
844
682k
                                    + g_ai2_ihevc_trans_16[10][k]
845
682k
                                                    * pi2_tmp[10 * trans_size]
846
682k
                                    + g_ai2_ihevc_trans_16[14][k]
847
682k
                                                    * pi2_tmp[14 * trans_size];
848
682k
                }
849
170k
                eeo[0] =
850
170k
                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
851
170k
                                                + g_ai2_ihevc_trans_16[12][0]
852
170k
                                                                * pi2_tmp[12
853
170k
                                                                                * trans_size];
854
170k
                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
855
170k
                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
856
170k
                eeo[1] =
857
170k
                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
858
170k
                                                + g_ai2_ihevc_trans_16[12][1]
859
170k
                                                                * pi2_tmp[12
860
170k
                                                                                * trans_size];
861
170k
                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
862
170k
                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
863
864
                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
865
511k
                for(k = 0; k < 2; k++)
866
341k
                {
867
341k
                    ee[k] = eee[k] + eeo[k];
868
341k
                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
869
341k
                }
870
853k
                for(k = 0; k < 4; k++)
871
682k
                {
872
682k
                    e[k] = ee[k] + eo[k];
873
682k
                    e[k + 4] = ee[3 - k] - eo[3 - k];
874
682k
                }
875
1.53M
                for(k = 0; k < 8; k++)
876
1.36M
                {
877
1.36M
                    WORD32 itrans_out;
878
1.36M
                    itrans_out =
879
1.36M
                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
880
1.36M
                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
881
1.36M
                    itrans_out =
882
1.36M
                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
883
1.36M
                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
884
1.36M
                }
885
170k
                pi2_tmp++;
886
170k
                pu1_pred += pred_strd;
887
170k
                pu1_dst += dst_strd;
888
170k
            }
889
10.6k
        }
890
        /************************************************************************************************/
891
        /************************************END - IT_RECON_16x16****************************************/
892
        /************************************************************************************************/
893
22.6k
    }
894
85.6k
}
895