/src/libhevc/common/ihevc_itrans_recon_8x8.c

Source
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_itrans_recon_8x8.c
 *
 * @brief
 *  Contains function definitions for inverse transform  and reconstruction 8x8
 *
 *
 * @author
 *  100470
 *
 * @par List of Functions:
 *  - ihevc_itrans_recon_8x8()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <stdio.h>
#include <string.h>
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_defs.h"
#include "ihevc_trans_tables.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_func_selector.h"
#include "ihevc_trans_macros.h"

/**
 *******************************************************************************
 *
 * @brief
 *  One 8-point inverse transform pass over a strided input vector
 *
 * @par Description:
 *  Reads the samples pi2_in[i * in_strd], combines them with the
 *  g_ai2_ihevc_trans_8 coefficients using the even/odd (butterfly)
 *  decomposition, adds the rounding offset, shifts right and clips every
 *  result to 16 bit. When first_4_only is non-zero only the samples
 *  i = 0..3 are read; the samples i = 4..7 are never accessed and contribute
 *  nothing to the result.
 *
 * @param[in] pi2_in
 *  Pointer to the first sample of the input vector
 *
 * @param[in] in_strd
 *  Distance (in WORD16 elements) between consecutive input samples
 *
 * @param[in] first_4_only
 *  Non-zero: use input samples 0..3 only. Zero: use all 8 input samples
 *
 * @param[in] add
 *  Rounding offset added before the shift
 *
 * @param[in] shift
 *  Right shift applied to every output
 *
 * @param[out] pi4_out
 *  8 transformed values, each already clipped with CLIP_S16
 *
 * @returns  Void
 *
 *******************************************************************************
 */
static void ihevc_itrans_8x8_butterfly(const WORD16 *pi2_in,
                                       WORD32 in_strd,
                                       WORD32 first_4_only,
                                       WORD32 add,
                                       WORD32 shift,
                                       WORD32 *pi4_out)
{
    WORD32 k;
    WORD32 e[4], o[4];
    WORD32 ee[2], eo[2];

    /* Contribution of input samples 0..3 (always present) */
    for(k = 0; k < 4; k++)
    {
        o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_in[in_strd]
                        + g_ai2_ihevc_trans_8[3][k] * pi2_in[3 * in_strd];
    }
    eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_in[2 * in_strd];
    eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_in[2 * in_strd];
    ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_in[0];
    ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_in[0];

    /* Contribution of input samples 4..7; these locations are only read */
    /* when the caller has not flagged them as skippable                 */
    if(!first_4_only)
    {
        for(k = 0; k < 4; k++)
        {
            o[k] = o[k]
                            + g_ai2_ihevc_trans_8[5][k] * pi2_in[5 * in_strd]
                            + g_ai2_ihevc_trans_8[7][k] * pi2_in[7 * in_strd];
        }
        eo[0] = eo[0] + g_ai2_ihevc_trans_8[6][0] * pi2_in[6 * in_strd];
        eo[1] = eo[1] + g_ai2_ihevc_trans_8[6][1] * pi2_in[6 * in_strd];
        ee[0] = ee[0] + g_ai2_ihevc_trans_8[4][0] * pi2_in[4 * in_strd];
        ee[1] = ee[1] + g_ai2_ihevc_trans_8[4][1] * pi2_in[4 * in_strd];
    }

    /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    e[0] = ee[0] + eo[0];
    e[3] = ee[0] - eo[0];
    e[1] = ee[1] + eo[1];
    e[2] = ee[1] - eo[1];

    /* Outputs k and 7 - k share the same even part and differ only in the */
    /* sign of the odd part                                                */
    for(k = 0; k < 4; k++)
    {
        pi4_out[k] = CLIP_S16(((e[k] + o[k] + add) >> shift));
        pi4_out[k + 4] = CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs Inverse transform  and reconstruction for 8x8
 * input block
 *
 * @par Description:
 *  Performs a two stage inverse transform, adds the prediction data and clips
 * the output to 8 bit. Stage 1 transforms the columns of pi2_src and stores
 * each result as one row of pi2_tmp; stage 2 transforms the columns of
 * pi2_tmp, adds pu1_pred and writes one row of pu1_dst per column.
 *
 * @param[in] pi2_src
 *  Input 8x8 coefficients
 *
 * @param[in,out] pi2_tmp
 *  Temporary 8x8 buffer (row stride 8) used to store the 1st stage output.
 *  Overwritten by this function
 *
 * @param[in] pu1_pred
 *  Prediction 8x8 block
 *
 * @param[out] pu1_dst
 *  Output 8x8 block
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src. Bit j set means column j is not read and the
 *  matching row of pi2_tmp is filled with zeros. If bits 4..7 are all set,
 *  columns 4..7 are skipped entirely and rows 4..7 of pi2_tmp are neither
 *  written nor read
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src. Only bits 4..7 are examined: if all of them are set,
 *  rows 4..7 of pi2_src are not read by the 1st stage
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_itrans_recon_8x8(WORD16 *pi2_src,
                            WORD16 *pi2_tmp,
                            UWORD8 *pu1_pred,
                            UWORD8 *pu1_dst,
                            WORD32 src_strd,
                            WORD32 pred_strd,
                            WORD32 dst_strd,
                            WORD32 zero_cols,
                            WORD32 zero_rows)
{
    WORD32 j, k;
    WORD32 ai4_res[8];
    WORD32 add;
    WORD32 shift;
    WORD32 trans_size = TRANS_SIZE_8;
    WORD16 *pi2_tmp_orig = pi2_tmp;

    /* Upper nibble of zero_rows fully set: rows 4..7 of pi2_src are skipped */
    WORD32 skip_src_rows_4_to_7 = ((zero_rows & 0xF0) == 0xF0);

    /* Upper nibble of zero_cols fully set: columns 4..7 of pi2_src are      */
    /* skipped, hence rows 4..7 of pi2_tmp are never produced nor consumed.  */
    /* Captured here because zero_cols is shifted out during the 1st stage   */
    WORD32 skip_tmp_rows_4_to_7 = ((zero_cols & 0xF0) == 0xF0);
    WORD32 num_cols_1st_stage = skip_tmp_rows_4_to_7 ? 4 : TRANS_SIZE_8;

    /************************************************************************************************/
    /**********************************START - IT_RECON_8x8******************************************/
    /************************************************************************************************/

    /* Inverse Transform 1st stage : column j of pi2_src -> row j of pi2_tmp */
    shift = IT_SHIFT_STAGE_1;
    add = 1 << (shift - 1);

    for(j = 0; j < num_cols_1st_stage; j++)
    {
        /* Checking for Zero Cols */
        if((zero_cols & 1) == 1)
        {
            memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
        }
        else
        {
            ihevc_itrans_8x8_butterfly(pi2_src, src_strd, skip_src_rows_4_to_7,
                                       add, shift, ai4_res);
            for(k = 0; k < 4; k++)
            {
                pi2_tmp[k] = ai4_res[k];
                pi2_tmp[k + 4] = ai4_res[k + 4];
            }
        }
        pi2_src++;
        pi2_tmp += trans_size;
        zero_cols = zero_cols >> 1;
    }

    pi2_tmp = pi2_tmp_orig;

    /* Inverse Transform 2nd stage : column j of pi2_tmp -> row j of pu1_dst */
    shift = IT_SHIFT_STAGE_2;
    add = 1 << (shift - 1);

    for(j = 0; j < trans_size; j++)
    {
        ihevc_itrans_8x8_butterfly(pi2_tmp, trans_size, skip_tmp_rows_4_to_7,
                                   add, shift, ai4_res);

        /* Add the prediction and clip to 8 bit */
        for(k = 0; k < 4; k++)
        {
            pu1_dst[k] = CLIP_U8((ai4_res[k] + pu1_pred[k]));
            pu1_dst[k + 4] = CLIP_U8((ai4_res[k + 4] + pu1_pred[k + 4]));
        }
        pi2_tmp++;
        pu1_pred += pred_strd;
        pu1_dst += dst_strd;
    }

    /************************************************************************************************/
    /************************************END - IT_RECON_8x8******************************************/
    /************************************************************************************************/
}


Coverage Report

Created: 2025-07-12 07:16

Line	Count	Source
1		/******************************************************************************
2		*
3		* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4		*
5		* Licensed under the Apache License, Version 2.0 (the "License");
6		* you may not use this file except in compliance with the License.
7		* You may obtain a copy of the License at:
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*
17		******************************************************************************/
18		/**
19		*******************************************************************************
20		* @file
21		* ihevc_itrans_recon_8x8.c
22		*
23		* @brief
24		* Contains function definitions for inverse transform and reconstruction 8x8
25		*
26		*
27		* @author
28		* 100470
29		*
30		* @par List of Functions:
31		* - ihevc_itrans_recon_8x8()
32		*
33		* @remarks
34		* None
35		*
36		*******************************************************************************
37		*/
38		#include <stdio.h>
39		#include <string.h>
40		#include "ihevc_typedefs.h"
41		#include "ihevc_macros.h"
42		#include "ihevc_platform_macros.h"
43		#include "ihevc_defs.h"
44		#include "ihevc_trans_tables.h"
45		#include "ihevc_itrans_recon.h"
46		#include "ihevc_func_selector.h"
47		#include "ihevc_trans_macros.h"
48
49		/**
50		*******************************************************************************
51		*
52		* @brief
53		* This function performs Inverse transform and reconstruction for 8x8
54		* input block
55		*
56		* @par Description:
57		* Performs inverse transform and adds the prediction data and clips output
58		* to 8 bit
59		*
60		* @param[in] pi2_src
61		* Input 8x8 coefficients
62		*
63		* @param[in] pi2_tmp
64		* Temporary 8x8 buffer for storing inverse
65		*
66		* transform
67		* 1st stage output
68		*
69		* @param[in] pu1_pred
70		* Prediction 8x8 block
71		*
72		* @param[out] pu1_dst
73		* Output 8x8 block
74		*
75		* @param[in] src_strd
76		* Input stride
77		*
78		* @param[in] pred_strd
79		* Prediction stride
80		*
81		* @param[in] dst_strd
82		* Output Stride
83		*
84		* @param[in] shift
85		* Output shift
86		*
87		* @param[in] zero_cols
88		* Zero columns in pi2_src
89		*
90		* @returns Void
91		*
92		* @remarks
93		* None
94		*
95		*******************************************************************************
96		*/
97
98		void ihevc_itrans_recon_8x8(WORD16 *pi2_src,
99		WORD16 *pi2_tmp,
100		UWORD8 *pu1_pred,
101		UWORD8 *pu1_dst,
102		WORD32 src_strd,
103		WORD32 pred_strd,
104		WORD32 dst_strd,
105		WORD32 zero_cols,
106		WORD32 zero_rows)
107	1.11M	{
108	1.11M	WORD32 j, k;
109	1.11M	WORD32 e[4], o[4];
110	1.11M	WORD32 ee[2], eo[2];
111	1.11M	WORD32 add;
112	1.11M	WORD32 shift;
113	1.11M	WORD16 *pi2_tmp_orig;
114	1.11M	WORD32 trans_size;
115	1.11M	WORD32 zero_rows_2nd_stage = zero_cols;
116	1.11M	WORD32 row_limit_2nd_stage;
117
118	1.11M	trans_size = TRANS_SIZE_8;
119
120	1.11M	pi2_tmp_orig = pi2_tmp;
121
122	1.11M	if((zero_cols & 0xF0) == 0xF0)
123	181k	row_limit_2nd_stage = 4;
124	934k	else
125	934k	row_limit_2nd_stage = TRANS_SIZE_8;
126
127
128	1.11M	if((zero_rows & 0xF0) == 0xF0) /* First 4 rows of input are non-zero */
129	161k	{
130		/************************************************************************************************/
131		/********************************START - IT_RECON_8x8****************************************/
132		/************************************************************************************************/
133
134		/* Inverse Transform 1st stage */
135	161k	shift = IT_SHIFT_STAGE_1;
136	161k	add = 1 << (shift - 1);
137
138	1.04M	for(j = 0; j < row_limit_2nd_stage; j++)
139	886k	{
140		/* Checking for Zero Cols */
141	886k	if((zero_cols & 1) == 1)
142	146k	{
143	146k	memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
144	146k	}
145	740k	else
146	740k	{
147		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
148	3.70M	for(k = 0; k < 4; k++)
149	2.96M	{
150	2.96M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
151	2.96M	+ g_ai2_ihevc_trans_8[3][k]
152	2.96M	* pi2_src[3 * src_strd];
153	2.96M	}
154	740k	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd];
155	740k	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd];
156	740k	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0];
157	740k	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0];
158
159		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
160	740k	e[0] = ee[0] + eo[0];
161	740k	e[3] = ee[0] - eo[0];
162	740k	e[1] = ee[1] + eo[1];
163	740k	e[2] = ee[1] - eo[1];
164	3.70M	for(k = 0; k < 4; k++)
165	2.96M	{
166	2.96M	pi2_tmp[k] =
167	2.96M	CLIP_S16(((e[k] + o[k] + add) >> shift));
168	2.96M	pi2_tmp[k + 4] =
169	2.96M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
170	2.96M	}
171	740k	}
172	886k	pi2_src++;
173	886k	pi2_tmp += trans_size;
174	886k	zero_cols = zero_cols >> 1;
175	886k	}
176
177	161k	pi2_tmp = pi2_tmp_orig;
178
179		/* Inverse Transform 2nd stage */
180	161k	shift = IT_SHIFT_STAGE_2;
181	161k	add = 1 << (shift - 1);
182	161k	if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
183	101k	{
184	914k	for(j = 0; j < trans_size; j++)
185	812k	{
186		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
187	4.06M	for(k = 0; k < 4; k++)
188	3.24M	{
189	3.24M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
190	3.24M	+ g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
191	3.24M	}
192	812k	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
193	812k	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
194	812k	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
195	812k	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
196
197		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
198	812k	e[0] = ee[0] + eo[0];
199	812k	e[3] = ee[0] - eo[0];
200	812k	e[1] = ee[1] + eo[1];
201	812k	e[2] = ee[1] - eo[1];
202	4.06M	for(k = 0; k < 4; k++)
203	3.24M	{
204	3.24M	WORD32 itrans_out;
205	3.24M	itrans_out =
206	3.24M	CLIP_S16(((e[k] + o[k] + add) >> shift));
207	3.24M	pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
208	3.24M	itrans_out =
209	3.24M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
210	3.24M	pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
211	3.24M	}
212	812k	pi2_tmp++;
213	812k	pu1_pred += pred_strd;
214	812k	pu1_dst += dst_strd;
215	812k	}
216	101k	}
217	60.0k	else /* All rows of output of 1st stage are non-zero */
218	60.0k	{
219	540k	for(j = 0; j < trans_size; j++)
220	480k	{
221		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
222	2.40M	for(k = 0; k < 4; k++)
223	1.92M	{
224	1.92M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
225	1.92M	+ g_ai2_ihevc_trans_8[3][k]
226	1.92M	* pi2_tmp[3 * trans_size]
227	1.92M	+ g_ai2_ihevc_trans_8[5][k]
228	1.92M	* pi2_tmp[5 * trans_size]
229	1.92M	+ g_ai2_ihevc_trans_8[7][k]
230	1.92M	* pi2_tmp[7 * trans_size];
231	1.92M	}
232
233	480k	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
234	480k	+ g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
235	480k	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
236	480k	+ g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
237	480k	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
238	480k	+ g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
239	480k	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
240	480k	+ g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
241
242		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
243	480k	e[0] = ee[0] + eo[0];
244	480k	e[3] = ee[0] - eo[0];
245	480k	e[1] = ee[1] + eo[1];
246	480k	e[2] = ee[1] - eo[1];
247	2.40M	for(k = 0; k < 4; k++)
248	1.92M	{
249	1.92M	WORD32 itrans_out;
250	1.92M	itrans_out =
251	1.92M	CLIP_S16(((e[k] + o[k] + add) >> shift));
252	1.92M	pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
253	1.92M	itrans_out =
254	1.92M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
255	1.92M	pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
256	1.92M	}
257	480k	pi2_tmp++;
258	480k	pu1_pred += pred_strd;
259	480k	pu1_dst += dst_strd;
260	480k	}
261	60.0k	}
262		/************************************************************************************************/
263		/**********************************END - IT_RECON_8x8****************************************/
264		/************************************************************************************************/
265	161k	}
266	953k	else /* All rows of input are non-zero */
267	953k	{
268		/************************************************************************************************/
269		/********************************START - IT_RECON_8x8****************************************/
270		/************************************************************************************************/
271
272		/* Inverse Transform 1st stage */
273	953k	shift = IT_SHIFT_STAGE_1;
274	953k	add = 1 << (shift - 1);
275
276	8.26M	for(j = 0; j < row_limit_2nd_stage; j++)
277	7.31M	{
278		/* Checking for Zero Cols */
279	7.31M	if((zero_cols & 1) == 1)
280	41.8k	{
281	41.8k	memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
282	41.8k	}
283	7.26M	else
284	7.26M	{
285		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
286	36.3M	for(k = 0; k < 4; k++)
287	29.0M	{
288	29.0M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
289	29.0M	+ g_ai2_ihevc_trans_8[3][k]
290	29.0M	* pi2_src[3 * src_strd]
291	29.0M	+ g_ai2_ihevc_trans_8[5][k]
292	29.0M	* pi2_src[5 * src_strd]
293	29.0M	+ g_ai2_ihevc_trans_8[7][k]
294	29.0M	* pi2_src[7 * src_strd];
295	29.0M	}
296
297	7.26M	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
298	7.26M	+ g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
299	7.26M	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
300	7.26M	+ g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
301	7.26M	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
302	7.26M	+ g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
303	7.26M	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
304	7.26M	+ g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
305
306		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
307	7.26M	e[0] = ee[0] + eo[0];
308	7.26M	e[3] = ee[0] - eo[0];
309	7.26M	e[1] = ee[1] + eo[1];
310	7.26M	e[2] = ee[1] - eo[1];
311	36.3M	for(k = 0; k < 4; k++)
312	29.0M	{
313	29.0M	pi2_tmp[k] =
314	29.0M	CLIP_S16(((e[k] + o[k] + add) >> shift));
315	29.0M	pi2_tmp[k + 4] =
316	29.0M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
317	29.0M	}
318	7.26M	}
319	7.31M	pi2_src++;
320	7.31M	pi2_tmp += trans_size;
321	7.31M	zero_cols = zero_cols >> 1;
322	7.31M	}
323
324	953k	pi2_tmp = pi2_tmp_orig;
325
326		/* Inverse Transform 2nd stage */
327	953k	shift = IT_SHIFT_STAGE_2;
328	953k	add = 1 << (shift - 1);
329	953k	if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
330	79.6k	{
331	717k	for(j = 0; j < trans_size; j++)
332	637k	{
333		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
334	3.18M	for(k = 0; k < 4; k++)
335	2.54M	{
336	2.54M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
337	2.54M	+ g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
338	2.54M	}
339	637k	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
340	637k	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
341	637k	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
342	637k	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
343
344		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
345	637k	e[0] = ee[0] + eo[0];
346	637k	e[3] = ee[0] - eo[0];
347	637k	e[1] = ee[1] + eo[1];
348	637k	e[2] = ee[1] - eo[1];
349	3.18M	for(k = 0; k < 4; k++)
350	2.54M	{
351	2.54M	WORD32 itrans_out;
352	2.54M	itrans_out =
353	2.54M	CLIP_S16(((e[k] + o[k] + add) >> shift));
354	2.54M	pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
355	2.54M	itrans_out =
356	2.54M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
357	2.54M	pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
358	2.54M	}
359	637k	pi2_tmp++;
360	637k	pu1_pred += pred_strd;
361	637k	pu1_dst += dst_strd;
362	637k	}
363	79.6k	}
364	874k	else /* All rows of output of 1st stage are non-zero */
365	874k	{
366	7.86M	for(j = 0; j < trans_size; j++)
367	6.99M	{
368		/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
369	34.9M	for(k = 0; k < 4; k++)
370	27.9M	{
371	27.9M	o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
372	27.9M	+ g_ai2_ihevc_trans_8[3][k]
373	27.9M	* pi2_tmp[3 * trans_size]
374	27.9M	+ g_ai2_ihevc_trans_8[5][k]
375	27.9M	* pi2_tmp[5 * trans_size]
376	27.9M	+ g_ai2_ihevc_trans_8[7][k]
377	27.9M	* pi2_tmp[7 * trans_size];
378	27.9M	}
379
380	6.99M	eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
381	6.99M	+ g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
382	6.99M	eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
383	6.99M	+ g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
384	6.99M	ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
385	6.99M	+ g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
386	6.99M	ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
387	6.99M	+ g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
388
389		/* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
390	6.99M	e[0] = ee[0] + eo[0];
391	6.99M	e[3] = ee[0] - eo[0];
392	6.99M	e[1] = ee[1] + eo[1];
393	6.99M	e[2] = ee[1] - eo[1];
394	34.9M	for(k = 0; k < 4; k++)
395	27.9M	{
396	27.9M	WORD32 itrans_out;
397	27.9M	itrans_out =
398	27.9M	CLIP_S16(((e[k] + o[k] + add) >> shift));
399	27.9M	pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
400	27.9M	itrans_out =
401	27.9M	CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
402	27.9M	pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
403	27.9M	}
404	6.99M	pi2_tmp++;
405	6.99M	pu1_pred += pred_strd;
406	6.99M	pu1_dst += dst_strd;
407	6.99M	}
408	874k	}
409		/************************************************************************************************/
410		/**********************************END - IT_RECON_8x8****************************************/
411		/************************************************************************************************/
412	953k	}
413	1.11M	}
414