/src/libavc/common/ih264_resi_trans_quant.c

Source (jump to first uncovered line)
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
*******************************************************************************
* @file
*  ih264_resi_trans_quant.c
*
* @brief
*  Contains function definitions for the single stage forward transform for
*  H.264. The functions calculate the residue, do the forward transform (cf)
*  and then do quantization
*
* @author
*  ittiam
*
* @par List of Functions:
*  - ih264_resi_trans_quant_4x4
*  - ih264_resi_trans_quant_chroma_4x4
*  - ih264_hadamard_quant_4x4
*  - ih264_hadamard_quant_2x2_uv
*  - ih264_resi_trans_quant_8x8
*
* @remarks
*  none
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System Include Files */
#include <stddef.h>

/* User Include Files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_macros.h"
#include "ih264_size_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"


/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*  This function performs forward transform and quantization on a 4x4 block
*
* @par Description:
*  The function accepts a source buffer and a prediction buffer and computes
*  the residue (source - prediction) from them. The residue is transformed
*  row-wise and then column-wise, and every coefficient is quantized with
*  FWD_QUANT. The transform and the quantization are computed in place in the
*  output buffer.
*
* @param[in] pu1_src
*  Pointer to source sub-block
*
* @param[in] pu1_pred
*  Pointer to prediction sub-block
*
* @param[out] pi2_out
*  Pointer to the output sub-block. 16 contiguous values are written (4 rows
*  of 4); it holds the row-transformed residue first and the quantized
*  coefficients on return
*
* @param[in] src_strd
*  Source stride
*
* @param[in] pred_strd
*  Prediction stride
*
* @param[in] pu2_scale_matrix
*  Pointer to Forward Quant Scale Matrix (16 entries are read)
*
* @param[in] pu2_threshold_matrix
*  Pointer to Forward Quant Threshold Matrix (16 entries are read)
*
* @param[in] u4_qbits
*  QP_BITS_h264_4x4 + floor(QP/6)
*
* @param[in] u4_round_factor
*  Quantization Round factor
*
* @param[out] pu1_nnz
*  Total non-zero coefficients in the current sub-block
*
* @param[out] pi2_alt_dc_addr
*  Location that receives the DC coefficient of the block as it is before
*  quantization
*
* @remarks
*  Doubling in the butterflies is done with a multiplication, not "<< 1",
*  because the operands can be negative and left shifting a negative signed
*  value is undefined behaviour in C
*
*******************************************************************************
*/
void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_out,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                const UWORD16 *pu2_scale_matrix,
                                const UWORD16 *pu2_threshold_matrix,
                                UWORD32 u4_qbits,
                                UWORD32 u4_round_factor,
                                UWORD8 *pu1_nnz,
                                WORD16 *pi2_alt_dc_addr)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;

    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue) */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[1] - pu1_pred[1];
        x6 = pu1_src[2] - pu1_pred[2];
        x7 = pu1_src[3] - pu1_pred[3];

        /* Horizontal transform */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* x2 and x3 can be negative: multiply instead of left shifting */
        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row; */
        pu1_src += src_strd;
        pu1_pred += pred_strd;
        pi2_out_tmp += 4;
    }

    pi2_out_tmp = pi2_out;
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* Vertical transform and quantization, one column per iteration */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */
        i4_value = x0 + x1;
        if(i == 0)
        {
            /* first column, first row: hand out the unquantized DC value */
            (*pi2_alt_dc_addr) = i4_value;
        }
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        /* x2 and x3 can be negative: multiply instead of left shifting */
        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4],
                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8],
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12],
                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        /* next column of the block and of both quantization matrices */
        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz =  u4_nonzero_coeff;
}

/**
*******************************************************************************
*
* @brief
*  This function performs forward transform and quantization on a 4x4
*  chroma block with interleaved values
*
* @par Description:
*  The function accepts a source buffer and a prediction buffer and computes
*  the residue (source - prediction) from them, using every second byte of
*  each row (offsets 0, 2, 4 and 6). The residue is transformed row-wise and
*  then column-wise, and every coefficient is quantized with FWD_QUANT. The
*  transform and the quantization are computed in place in the output buffer.
*
* @param[in] pu1_src
*  Pointer to source sub-block (interleaved samples)
*
* @param[in] pu1_pred
*  Pointer to prediction sub-block (interleaved samples)
*
* @param[out] pi2_out
*  Pointer to the output sub-block. 16 contiguous values are written (4 rows
*  of 4); it holds the row-transformed residue first and the quantized
*  coefficients on return
*
* @param[in] src_strd
*  Source stride
*
* @param[in] pred_strd
*  Prediction stride
*
* @param[in] pu2_scale_matrix
*  Pointer to Forward Quant Scale Matrix (16 entries are read)
*
* @param[in] pu2_threshold_matrix
*  Pointer to Forward Quant Threshold Matrix (16 entries are read)
*
* @param[in] u4_qbits
*  QP_BITS_h264_4x4 + floor(QP/6)
*
* @param[in] u4_round_factor
*  Quantization Round factor
*
* @param[out] pu1_nnz
*  Total non-zero coefficients in the current sub-block
*
* @param[out] pu1_dc_alt_addr
*  Location that receives the DC coefficient of the block as it is before
*  quantization
*
* @remarks
*  Doubling in the butterflies is done with a multiplication, not "<< 1",
*  because the operands can be negative and left shifting a negative signed
*  value is undefined behaviour in C
*
*******************************************************************************
*/
void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
                                       UWORD8 *pu1_pred,
                                       WORD16 *pi2_out,
                                       WORD32 src_strd,
                                       WORD32 pred_strd,
                                       const UWORD16 *pu2_scale_matrix,
                                       const UWORD16 *pu2_threshold_matrix,
                                       UWORD32 u4_qbits,
                                       UWORD32 u4_round_factor,
                                       UWORD8 *pu1_nnz,
                                       WORD16 *pu1_dc_alt_addr)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;

    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue); samples are interleaved, */
        /* so only every second byte belongs to this block                */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[2] - pu1_pred[2];
        x6 = pu1_src[4] - pu1_pred[4];
        x7 = pu1_src[6] - pu1_pred[6];

        /* Horizontal transform */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* x2 and x3 can be negative: multiply instead of left shifting */
        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row; */
        pu1_src += src_strd;
        pu1_pred += pred_strd;
        pi2_out_tmp += 4;
    }

    pi2_out_tmp = pi2_out;
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* Vertical transform and quantization, one column per iteration */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */
        i4_value = x0 + x1;
        if(i == 0)
        {
            /* first column, first row: hand out the unquantized DC value */
            *pu1_dc_alt_addr = i4_value;
        }
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        /* x2 and x3 can be negative: multiply instead of left shifting */
        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4],
                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8],
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12],
                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        /* next column of the block and of both quantization matrices */
        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz =  u4_nonzero_coeff;
}

/**
*******************************************************************************
*
* @brief
*  This function performs forward hadamard transform and quantization on a
*  4x4 block
*
* @par Description:
*  The function reads 16 values from the source buffer (4 rows of 4), applies
*  a 4-point add/subtract butterfly to every row and then to every column,
*  halves each result (>> 1) and quantizes it with FWD_QUANT. The row pass
*  writes its results to the destination buffer; the column pass and the
*  quantization are then computed in place in that buffer. Every coefficient
*  is quantized with the first entry of the threshold and scale matrices.
*
* @param[in] pi2_src
*  Pointer to source sub-block (16 values are read)
*
* @param[out] pi2_dst
*  Pointer to destination sub-block (16 values are written)
*
* @param[in] pu2_scale_matrix
*  Pointer to Forward Quant Scale Matrix (only entry 0 is read)
*
* @param[in] pu2_threshold_matrix
*  Pointer to Forward Quant Threshold Matrix (only entry 0 is read)
*
* @param[in] u4_qbits
*  QP_BITS_h264_4x4 + floor(QP/6)
*
* @param[in] u4_round_factor
*  Quantization Round factor
*
* @param[out] pu1_nnz
*  Total non-zero coefficients in the current sub-block; it is reset to 0 on
*  entry and passed to FWD_QUANT as the counter
*
* @remarks none
*
********************************************************************************
*/
void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
                              WORD16 *pi2_dst,
                              const UWORD16 *pu2_scale_matrix,
                              const UWORD16 *pu2_threshold_matrix,
                              UWORD32 u4_qbits,
                              UWORD32 u4_round_factor,
                              UWORD8 *pu1_nnz)
{
    WORD32 i4_row, i4_col;
    WORD32 i4_in0, i4_in1, i4_in2, i4_in3;
    WORD32 i4_sum_outer, i4_sum_inner, i4_diff_inner, i4_diff_outer;
    WORD32 i4_value;

    pu1_nnz[0] = 0;

    /* Horizontal transform: one row of 4 values per iteration */
    for(i4_row = 0; i4_row < SUB_BLK_WIDTH_4x4; i4_row++)
    {
        const WORD16 *pi2_in_row = pi2_src + (i4_row * 4);
        WORD16 *pi2_out_row = pi2_dst + (i4_row * 4);

        /* the whole row is read before anything is written */
        i4_in0 = pi2_in_row[0];
        i4_in1 = pi2_in_row[1];
        i4_in2 = pi2_in_row[2];
        i4_in3 = pi2_in_row[3];

        i4_sum_outer = i4_in0 + i4_in3;
        i4_sum_inner = i4_in1 + i4_in2;
        i4_diff_inner = i4_in1 - i4_in2;
        i4_diff_outer = i4_in0 - i4_in3;

        pi2_out_row[0] = i4_sum_outer + i4_sum_inner;
        pi2_out_row[1] = i4_diff_outer + i4_diff_inner;
        pi2_out_row[2] = i4_sum_outer - i4_sum_inner;
        pi2_out_row[3] = i4_diff_outer - i4_diff_inner;
    }

    /* Vertical transform and quantization: one column per iteration */
    for(i4_col = 0; i4_col < SUB_BLK_WIDTH_4x4; i4_col++)
    {
        WORD16 *pi2_col = pi2_dst + i4_col;

        i4_in0 = pi2_col[0];
        i4_in1 = pi2_col[4];
        i4_in2 = pi2_col[8];
        i4_in3 = pi2_col[12];

        i4_sum_outer = i4_in0 + i4_in3;
        i4_sum_inner = i4_in1 + i4_in2;
        i4_diff_inner = i4_in1 - i4_in2;
        i4_diff_outer = i4_in0 - i4_in3;

        /* every coefficient uses entry 0 of the threshold/scale matrices */
        i4_value = (i4_sum_outer + i4_sum_inner) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_col[0] = i4_value;

        i4_value = (i4_diff_outer + i4_diff_inner) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_col[4] = i4_value;

        i4_value = (i4_sum_outer - i4_sum_inner) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_col[8] = i4_value;

        i4_value = (i4_diff_outer - i4_diff_inner) >> 1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_col[12] = i4_value;
    }
}

/**
*******************************************************************************
*
* @brief
*   This function performs forward hadamard transform and quantization on a
*   2x2 block for both U and V planes
*
* @par Description:
*  The function processes two consecutive groups of 4 values from the source
*  buffer. For each group it applies a 2x2 add/subtract butterfly, quantizes
*  the 4 results with FWD_QUANT and writes them to the matching group of 4
*  values in the destination buffer. Every coefficient is quantized with the
*  first entry of the threshold and scale matrices.
*
* @param[in] pi2_src
*  Pointer to source sub-blocks (8 values are read, 4 per plane)
*
* @param[out] pi2_dst
*  Pointer to destination sub-blocks (8 values are written, 4 per plane)
*
* @param[in] pu2_scale_matrix
*  Pointer to Forward Quant Scale Matrix (only entry 0 is read)
*
* @param[in] pu2_threshold_matrix
*  Pointer to Forward Quant Threshold Matrix (only entry 0 is read)
*
* @param[in] u4_qbits
*  QP_BITS_h264_4x4 + floor(QP/6)
*
* @param[in] u4_round_factor
*  Quantization Round factor
*
* @param[out] pu1_nnz
*  Non-zero coefficient counts, one entry per plane
*
* @remarks
*  NNZ is populated at pu1_nnz[0] for the first group of 4 values and at
*  pu1_nnz[1] for the second group; both entries are reset to 0 first
*
*******************************************************************************
*/
void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
                                 WORD16 *pi2_dst,
                                 const UWORD16 *pu2_scale_matrix,
                                 const UWORD16 *pu2_threshold_matrix,
                                 UWORD32 u4_qbits,
                                 UWORD32 u4_round_factor,
                                 UWORD8 *pu1_nnz)
{
    WORD32 i4_plane;
    WORD32 i4_c0, i4_c1, i4_c2, i4_c3;
    WORD32 i4_sum01, i4_diff01, i4_sum23, i4_diff23;
    WORD32 i4_value;

    for(i4_plane = 0; i4_plane < 2; i4_plane++)
    {
        /* each plane owns 4 consecutive values in source and destination */
        const WORD16 *pi2_in = pi2_src + (i4_plane * 4);
        WORD16 *pi2_res = pi2_dst + (i4_plane * 4);

        pu1_nnz[i4_plane] = 0;

        /* Horizontal transform */
        i4_c0 = pi2_in[0];
        i4_c1 = pi2_in[1];
        i4_c2 = pi2_in[2];
        i4_c3 = pi2_in[3];

        i4_sum01 = i4_c0 + i4_c1;
        i4_diff01 = i4_c0 - i4_c1;
        i4_sum23 = i4_c2 + i4_c3;
        i4_diff23 = i4_c2 - i4_c3;

        /* Vertical transform and quantization */
        i4_value = i4_sum01 + i4_sum23;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_res[0] = i4_value;

        i4_value = i4_sum01 - i4_sum23;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_res[2] = i4_value;

        i4_value = i4_diff01 - i4_diff23;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_res[3] = i4_value;

        i4_value = i4_diff01 + i4_diff23;
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_res[1] = i4_value;
    }
}

/**
*******************************************************************************
*
* @brief
*  One dimensional 8-point forward transform butterfly used by
*  ih264_resi_trans_quant_8x8 for both the horizontal and the vertical pass
*
* @param[in] pi4_in
*  Pointer to the 8 input values
*
* @param[out] pi4_out
*  Pointer to the 8 transformed values; must not overlap pi4_in
*
* @remarks none
*
*******************************************************************************
*/
static void ih264_cf8_1d_butterfly(const WORD32 *pi4_in, WORD32 *pi4_out)
{
    /* sums and differences of the mirrored input pairs (0,7) (1,6) (2,5) (3,4) */
    WORD32 i4_s0 = pi4_in[0] + pi4_in[7];
    WORD32 i4_s1 = pi4_in[1] + pi4_in[6];
    WORD32 i4_s2 = pi4_in[2] + pi4_in[5];
    WORD32 i4_s3 = pi4_in[3] + pi4_in[4];

    WORD32 i4_d0 = pi4_in[0] - pi4_in[7];
    WORD32 i4_d1 = pi4_in[1] - pi4_in[6];
    WORD32 i4_d2 = pi4_in[2] - pi4_in[5];
    WORD32 i4_d3 = pi4_in[3] - pi4_in[4];

    /* even outputs are built from the sums */
    WORD32 i4_e0 = i4_s0 + i4_s3;
    WORD32 i4_e1 = i4_s1 + i4_s2;
    WORD32 i4_e2 = i4_s0 - i4_s3;
    WORD32 i4_e3 = i4_s1 - i4_s2;

    /* odd outputs are built from the differences */
    WORD32 i4_o0 = i4_d1 + i4_d2 + ((i4_d0 >> 1) + i4_d0);
    WORD32 i4_o1 = i4_d0 - i4_d3 - ((i4_d2 >> 1) + i4_d2);
    WORD32 i4_o2 = i4_d0 + i4_d3 - ((i4_d1 >> 1) + i4_d1);
    WORD32 i4_o3 = i4_d1 - i4_d2 + ((i4_d3 >> 1) + i4_d3);

    pi4_out[0] = i4_e0 + i4_e1;
    pi4_out[2] = i4_e2 + (i4_e3 >> 1);
    pi4_out[4] = i4_e0 - i4_e1;
    pi4_out[6] = (i4_e2 >> 1) - i4_e3;

    pi4_out[1] = i4_o0 + (i4_o3 >> 2);
    pi4_out[3] = i4_o1 + (i4_o2 >> 2);
    pi4_out[5] = i4_o2 - (i4_o1 >> 2);
    pi4_out[7] = (i4_o0 >> 2) - i4_o3;
}

/**
*******************************************************************************
*
* @brief
*  This function performs Single stage forward transform CF8 and quantization
*  on 8x8 blocks
*
* @par Description:
*  Computes the residue (source - prediction) of an 8x8 block, applies the
*  8-point forward transform to every row and then to every column, and
*  quantizes each coefficient with FWD_QUANT. The transform and the
*  quantization are computed in place in the output buffer.
*
* @param[in] pu1_src
*  Pointer to source sub-block
*
* @param[in] pu1_pred
*  Pointer to prediction sub-block
*
* @param[out] pi2_out
*  Pointer to the output sub-block. 64 contiguous values are written (8 rows
*  of 8); it holds the row-transformed residue first and the quantized
*  coefficients on return
*
* @param[in] src_strd
*  Source stride
*
* @param[in] pred_strd
*  Prediction stride
*
* @param[in] pu2_scale_matrix
*  Pointer to Forward Quant Scale Matrix (64 entries are read)
*
* @param[in] pu2_threshold_matrix
*  Pointer to Forward Quant Threshold Matrix (64 entries are read)
*
* @param[in] u4_qbits
*  QP_BITS_h264_8x8 + floor(QP/6)
*
* @param[in] u4_round_factor
*  Quantization Round factor
*
* @param[out] pu1_nnz
*  Total non-zero coefficients in the current sub-block
*
* @param[in] pu1_dc_alt_addr
*  UNUSED
*
* @returns none
*
* @remarks:
*  TODO: This function needs to be tested before integration
*
*******************************************************************************
*/
void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_out,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                const UWORD16 *pu2_scale_matrix,
                                const UWORD16 *pu2_threshold_matrix,
                                UWORD32 u4_qbits,
                                UWORD32 u4_round_factor,
                                UWORD8 *pu1_nnz,
                                WORD16 *pu1_dc_alt_addr)
{
    WORD32 i4_row, i4_col, i4_k;
    WORD32 ai4_in[8];
    WORD32 ai4_coef[8];
    UWORD32 u4_nonzero_coeff = 0;

    UNUSED(pu1_dc_alt_addr);

    /* Horizontal transform: residue of one row in, 8 coefficients out */
    for(i4_row = 0; i4_row < SUB_BLK_WIDTH_8x8; ++i4_row)
    {
        WORD16 *pi2_row = pi2_out + (i4_row * 8);

        for(i4_k = 0; i4_k < 8; ++i4_k)
        {
            ai4_in[i4_k] = pu1_src[i4_k] - pu1_pred[i4_k];
        }

        ih264_cf8_1d_butterfly(ai4_in, ai4_coef);

        for(i4_k = 0; i4_k < 8; ++i4_k)
        {
            pi2_row[i4_k] = ai4_coef[i4_k];
        }

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* vertical transform and quant: one column per iteration */
    for(i4_col = 0; i4_col < SUB_BLK_WIDTH_8x8; ++i4_col)
    {
        WORD16 *pi2_col = pi2_out + i4_col;
        const UWORD16 *pu2_col_thresh = pu2_threshold_matrix + i4_col;
        const UWORD16 *pu2_col_scale = pu2_scale_matrix + i4_col;

        /* the whole column is read before any of it is overwritten */
        for(i4_k = 0; i4_k < 8; ++i4_k)
        {
            ai4_in[i4_k] = pi2_col[i4_k * 8];
        }

        ih264_cf8_1d_butterfly(ai4_in, ai4_coef);

        /* quantize with the matrix entry at the coefficient's own position */
        for(i4_k = 0; i4_k < 8; ++i4_k)
        {
            FWD_QUANT(ai4_coef[i4_k], pu2_col_thresh[i4_k * 8],
                      pu2_col_scale[i4_k * 8], u4_round_factor, u4_qbits,
                      u4_nonzero_coeff);
            pi2_col[i4_k * 8] = ai4_coef[i4_k];
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz =  u4_nonzero_coeff;
}

Coverage Report

Created: 2025-08-26 06:38

Line	Count	Source (jump to first uncovered line)
1		/******************************************************************************
2		*
3		* Copyright (C) 2015 The Android Open Source Project
4		*
5		* Licensed under the Apache License, Version 2.0 (the "License");
6		* you may not use this file except in compliance with the License.
7		* You may obtain a copy of the License at:
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*
17		*****************************************************************************
18		* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19		*/
20
21		/**
22		*******************************************************************************
23		* @file
24		* ih264_resi_trans_quant.c
25		*
26		* @brief
27		* Contains function definitions single stage forward transform for H.264
28		* It will calculate the residue, do the cf and then do quantization
29		*
30		* @author
31		* ittiam
32		*
33		* @par List of Functions:
34		* - ih264_resi_trans_quant_4x4
35		* - ih264_resi_trans_quant_chroma_4x4
36		* - ih264_hadamard_quant_4x4
37		* - ih264_hadamard_quant_2x2_uv
38		* - ih264_resi_trans_quant_8x8
39		*
40		* @remarks
41		* none
42		*
43		*******************************************************************************
44		*/
45
46
47		/*****************************************************************************/
48		/* File Includes */
49		/*****************************************************************************/
50
51		/* System Include Files */
52		#include <stddef.h>
53
54		/* User Include Files */
55		#include "ih264_typedefs.h"
56		#include "ih264_defs.h"
57		#include "ih264_macros.h"
58		#include "ih264_size_defs.h"
59		#include "ih264_trans_macros.h"
60		#include "ih264_trans_data.h"
61		#include "ih264_structs.h"
62		#include "ih264_trans_quant_itrans_iquant.h"
63
64
65		/*****************************************************************************/
66		/* Function Definitions */
67		/*****************************************************************************/
68
69		/**
70		*******************************************************************************
71		*
72		* @brief
73		* This function performs forward transform and quantization on a 4x4 block
74		*
75		* @par Description:
76		* The function accepts source buffer and estimation buffer. From these, it
77		* computes the residue. This is residue is then transformed and quantized.
78		* The transform and quantization are in placed computed. They use the residue
79		* buffer for this.
80		*
81		* @param[in] pu1_src
82		* Pointer to source sub-block
83		*
84		* @param[in] pu1_pred
85		* Pointer to prediction sub-block
86		*
87		* @param[in] pi2_out
88		* Pointer to residual sub-block
89		*
90		* @param[in] src_strd
91		* Source stride
92		*
93		* @param[in] pred_strd
94		* Prediction stride
95		*
96		* @param[in] pu2_scale_matrix
97		* Pointer to Forward Quant Scale Matrix
98		*
99		* @param[in] pu2_threshold_matrix
100		* Pointer to Forward Quant Threshold Matrix
101		*
102		* @param[in] u4_qbits
103		* QP_BITS_h264_4x4 + floor(QP/6)
104		*
105		* @param[in] u4_round_factor
106		* Quantization Round factor
107		*
108		* @param[out] pu1_nnz
109		* Total non-zero coefficients in the current sub-block
110		*
111		* @param[in] pi2_alt_dc_addr
112		* DC Coefficient of the block
113		*
114		* @remarks none
115		*
116		*******************************************************************************
117		*/
118		void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
119		UWORD8 *pu1_pred,
120		WORD16 *pi2_out,
121		WORD32 src_strd,
122		WORD32 pred_strd,
123		const UWORD16 *pu2_scale_matrix,
124		const UWORD16 *pu2_threshold_matrix,
125		UWORD32 u4_qbits,
126		UWORD32 u4_round_factor,
127		UWORD8 *pu1_nnz,
128		WORD16 *pi2_alt_dc_addr)
129	0	{
130	0	UWORD32 i;
131	0	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
132	0	WORD32 i4_value;
133	0	WORD16 *pi2_out_tmp = pi2_out;
134	0	UWORD32 u4_nonzero_coeff = 0;
135
136	0	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
137	0	{
138		/* computing prediction error (residue) */
139	0	x4 = pu1_src[0] - pu1_pred[0];
140	0	x5 = pu1_src[1] - pu1_pred[1];
141	0	x6 = pu1_src[2] - pu1_pred[2];
142	0	x7 = pu1_src[3] - pu1_pred[3];
143
144		/* Horizontal transform */
145	0	x0 = x4 + x7;
146	0	x1 = x5 + x6;
147	0	x2 = x5 - x6;
148	0	x3 = x4 - x7;
149
150	0	pi2_out_tmp[0] = x0 + x1;
151	0	pi2_out_tmp[1] = (x3 << 1) + x2;
152	0	pi2_out_tmp[2] = x0 - x1;
153	0	pi2_out_tmp[3] = x3 - (x2 << 1);
154
155		/* pointing to next row; */
156	0	pu1_src += src_strd;
157	0	pu1_pred += pred_strd;
158	0	pi2_out_tmp += 4;
159	0	}
160
161	0	pi2_out_tmp = pi2_out;
162	0	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
163	0	{
164		/* Vertical transform and quantization */
165	0	x4 = pi2_out_tmp[0];
166	0	x5 = pi2_out_tmp[4];
167	0	x6 = pi2_out_tmp[8];
168	0	x7 = pi2_out_tmp[12];
169
170	0	x0 = x4 + x7;
171	0	x1 = x5 + x6;
172	0	x2 = x5 - x6;
173	0	x3 = x4 - x7;
174
175		/* quantization is done in place */
176	0	i4_value = x0 + x1;
177	0	if(i == 0)
178	0	{
179	0	(*pi2_alt_dc_addr) = i4_value;
180	0	}
181	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
182	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
183	0	u4_nonzero_coeff);
184	0	pi2_out_tmp[0] = i4_value;
185
186	0	i4_value = (x3 << 1) + x2;
187	0	FWD_QUANT(i4_value, pu2_threshold_matrix[4],
188	0	pu2_scale_matrix[4], u4_round_factor, u4_qbits,
189	0	u4_nonzero_coeff);
190	0	pi2_out_tmp[4] = i4_value;
191
192	0	i4_value = x0 - x1;
193	0	FWD_QUANT(i4_value, pu2_threshold_matrix[8],
194	0	pu2_scale_matrix[8], u4_round_factor, u4_qbits,
195	0	u4_nonzero_coeff);
196	0	pi2_out_tmp[8] = i4_value;
197
198	0	i4_value = x3 - (x2 << 1);
199	0	FWD_QUANT(i4_value, pu2_threshold_matrix[12],
200	0	pu2_scale_matrix[12], u4_round_factor, u4_qbits,
201	0	u4_nonzero_coeff);
202	0	pi2_out_tmp[12] = i4_value;
203
204	0	pi2_out_tmp++;
205	0	pu2_scale_matrix++;
206	0	pu2_threshold_matrix++;
207	0	}
208
209		/* Return total nonzero coefficients in the current sub block */
210	0	*pu1_nnz = u4_nonzero_coeff;
211	0	}
212
213		/**
214		*******************************************************************************
215		*
216		* @brief
217		* This function performs forward transform and quantization on a 4x4
218		* chroma block with interleaved values
219		*
220		* @par Description:
221		* The function accepts source buffer and estimation buffer. From these, it
222		* computes the residue. This is residue is then transformed and quantized.
223		* The transform and quantization are in placed computed. They use the residue
224		* buffer for this.
225		*
226		* @param[in] pu1_src
227		* Pointer to source sub-block
228		*
229		* @param[in] pu1_pred
230		* Pointer to prediction sub-block
231		*
232		* @param[in] pi2_out
233		* Pointer to residual sub-block
234		*
235		* @param[in] src_strd
236		* Source stride
237		*
238		* @param[in] pred_strd
239		* Prediction stride
240		*
241		* @param[in] pu2_scale_matrix
242		* Pointer to Forward Quant Scale Matrix
243		*
244		* @param[in] pu2_threshold_matrix
245		* Pointer to Forward Quant Threshold Matrix
246		*
247		* @param[in] u4_qbits
248		* QP_BITS_h264_4x4 + floor(QP/6)
249		*
250		* @param[in] u4_round_factor
251		* Quantization Round factor
252		*
253		* @param[out] pu1_nnz
254		* Total non-zero coefficients in the current sub-block
255		*
256		* @param[in] pi2_alt_dc_addr
257		* DC Coefficient of the block
258		*
259		* @remarks none
260		*
261		*******************************************************************************
262		*/
263		void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
264		UWORD8 *pu1_pred,
265		WORD16 *pi2_out,
266		WORD32 src_strd,
267		WORD32 pred_strd,
268		const UWORD16 *pu2_scale_matrix,
269		const UWORD16 *pu2_threshold_matrix,
270		UWORD32 u4_qbits,
271		UWORD32 u4_round_factor,
272		UWORD8 *pu1_nnz,
273		WORD16 *pu1_dc_alt_addr)
274	0	{
275	0	UWORD32 i;
276	0	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
277	0	WORD32 i4_value;
278	0	WORD16 *pi2_out_tmp = pi2_out;
279	0	UWORD32 u4_nonzero_coeff = 0;
280
281	0	for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
282	0	{
283		/* computing prediction error (residue) */
284	0	x4 = pu1_src[0] - pu1_pred[0];
285	0	x5 = pu1_src[2] - pu1_pred[2];
286	0	x6 = pu1_src[4] - pu1_pred[4];
287	0	x7 = pu1_src[6] - pu1_pred[6];
288
289		/* Horizontal transform */
290	0	x0 = x4 + x7;
291	0	x1 = x5 + x6;
292	0	x2 = x5 - x6;
293	0	x3 = x4 - x7;
294
295	0	pi2_out_tmp[0] = x0 + x1;
296	0	pi2_out_tmp[1] = (x3 << 1) + x2;
297	0	pi2_out_tmp[2] = x0 - x1;
298	0	pi2_out_tmp[3] = x3 - (x2 << 1);
299
300		/* pointing to next row; */
301	0	pu1_src += src_strd;
302	0	pu1_pred += pred_strd;
303	0	pi2_out_tmp += 4;
304	0	}
305
306	0	pi2_out_tmp = pi2_out;
307	0	for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
308	0	{
309		/* Vertical transform and quantization */
310	0	x4 = pi2_out_tmp[0];
311	0	x5 = pi2_out_tmp[4];
312	0	x6 = pi2_out_tmp[8];
313	0	x7 = pi2_out_tmp[12];
314
315	0	x0 = x4 + x7;
316	0	x1 = x5 + x6;
317	0	x2 = x5 - x6;
318	0	x3 = x4 - x7;
319
320		/* quantization is done in place */
321	0	i4_value = x0 + x1;
322	0	if(i == 0)
323	0	{
324	0	*pu1_dc_alt_addr = i4_value;
325	0	}
326	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
327	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
328	0	u4_nonzero_coeff);
329	0	pi2_out_tmp[0] = i4_value;
330
331	0	i4_value = (x3 << 1) + x2;
332	0	FWD_QUANT(i4_value, pu2_threshold_matrix[4],
333	0	pu2_scale_matrix[4], u4_round_factor, u4_qbits,
334	0	u4_nonzero_coeff);
335	0	pi2_out_tmp[4] = i4_value;
336
337	0	i4_value = x0 - x1;
338	0	FWD_QUANT(i4_value, pu2_threshold_matrix[8],
339	0	pu2_scale_matrix[8], u4_round_factor, u4_qbits,
340	0	u4_nonzero_coeff);
341	0	pi2_out_tmp[8] = i4_value;
342
343	0	i4_value = x3 - (x2 << 1);
344	0	FWD_QUANT(i4_value, pu2_threshold_matrix[12],
345	0	pu2_scale_matrix[12], u4_round_factor, u4_qbits,
346	0	u4_nonzero_coeff);
347	0	pi2_out_tmp[12] = i4_value;
348
349	0	pi2_out_tmp++;
350	0	pu2_scale_matrix++;
351	0	pu2_threshold_matrix++;
352	0	}
353
354		/* Return total nonzero coefficients in the current sub block */
355	0	*pu1_nnz = u4_nonzero_coeff;
356	0	}
357
358		/**
359		*******************************************************************************
360		*
361		* @brief
362		* This function performs forward hadamard transform and quantization on a
363		* 4x4 block
364		*
365		* @par Description:
366		* The function accepts source buffer and estimation buffer. From these, it
367		* computes the residue. This is residue is then transformed and quantized.
368		* The transform and quantization are in placed computed. They use the residue
369		* buffer for this.
370		*
371		* @param[in] pu1_src
372		* Pointer to source sub-block
373		*
374		* @param[in] pi2_dst
375		* Pointer to destination sub-block
376		*
377		* @param[in] pu2_threshold_matrix
378		* Pointer to Forward Quant Threshold Matrix
379		*
380		* @param[in] pu2_scale_matrix
381		* Pointer to Forward Quant Scale Matrix
382		*
383		* @param[in] u4_qbits
384		* QP_BITS_h264_4x4 + floor(QP/6)
385		*
386		* @param[in] u4_round_factor
387		* Quantization Round factor
388		*
389		* @param[out] pu1_nnz
390		* Total non-zero coefficients in the current sub-block
391		*
392		* @remarks none
393		*
394		********************************************************************************
395		*/
396		void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
397		WORD16 *pi2_dst,
398		const UWORD16 *pu2_scale_matrix,
399		const UWORD16 *pu2_threshold_matrix,
400		UWORD32 u4_qbits,
401		UWORD32 u4_round_factor,
402		UWORD8 *pu1_nnz)
403	0	{
404	0	WORD32 i;
405	0	WORD32 x0, x1, x2, x3, x4, x5, x6, x7, i4_value;
406
407	0	*pu1_nnz = 0;
408
409	0	for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
410	0	{
411	0	x4 = pi2_src[0];
412	0	x5 = pi2_src[1];
413	0	x6 = pi2_src[2];
414	0	x7 = pi2_src[3];
415
416	0	x0 = x4 + x7;
417	0	x1 = x5 + x6;
418	0	x2 = x5 - x6;
419	0	x3 = x4 - x7;
420
421	0	pi2_dst[0] = x0 + x1;
422	0	pi2_dst[1] = x3 + x2;
423	0	pi2_dst[2] = x0 - x1;
424	0	pi2_dst[3] = x3 - x2;
425
426	0	pi2_src += 4;
427	0	pi2_dst += 4;
428	0	}
429
430		/* Vertical transform and quantization */
431	0	pi2_dst -= SUB_BLK_WIDTH_4x4 << 2;
432
433	0	for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
434	0	{
435	0	x4 = pi2_dst[0];
436	0	x5 = pi2_dst[4];
437	0	x6 = pi2_dst[8];
438	0	x7 = pi2_dst[12];
439
440	0	x0 = x4 + x7;
441	0	x1 = x5 + x6;
442	0	x2 = x5 - x6;
443	0	x3 = x4 - x7;
444
445	0	i4_value = (x0 + x1) >> 1;
446	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
447	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
448	0	pi2_dst[0] = i4_value;
449
450	0	i4_value = (x3 + x2) >> 1;
451	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
452	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
453	0	pi2_dst[4] = i4_value;
454
455	0	i4_value = (x0 - x1) >> 1;
456	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
457	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
458	0	pi2_dst[8] = i4_value;
459
460	0	i4_value = (x3 - x2) >> 1;
461	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
462	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
463	0	pi2_dst[12] = i4_value;
464
465	0	pi2_dst++;
466	0	}
467	0	}
468
469		/**
470		*******************************************************************************
471		*
472		* @brief
473		* This function performs forward hadamard transform and quantization on a
474		* 2x2 block for both U and V planes
475		*
476		* @par Description:
477		* The function accepts source buffer and estimation buffer. From these, it
478		* computes the residue. This is residue is then transformed and quantized.
479		* The transform and quantization are in placed computed. They use the residue
480		* buffer for this.
481		*
482		* @param[in] pu1_src
483		* Pointer to source sub-block
484		*
485		* @param[in] pi2_dst
486		* Pointer to destination sub-block
487		*
488		* @param[in] pu2_threshold_matrix
489		* Pointer to Forward Quant Threshold Matrix
490		*
491		* @param[in] pu2_scale_matrix
492		* Pointer to Forward Quant Scale Matrix
493		*
494		* @param[in] u4_qbits
495		* QP_BITS_h264_4x4 + floor(QP/6)
496		*
497		* @param[in] u4_round_factor
498		* Quantization Round factor
499		*
500		* @param[out] pu1_nnz
501		* Total non-zero coefficients in the current sub-block
502		*
503		* @remarks
504		* NNZ for dc is populated at 0 and 5th position of pu1_nnz
505		*
506		*******************************************************************************
507		*/
508		void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
509		WORD16 *pi2_dst,
510		const UWORD16 *pu2_scale_matrix,
511		const UWORD16 *pu2_threshold_matrix,
512		UWORD32 u4_qbits,
513		UWORD32 u4_round_factor,
514		UWORD8 *pu1_nnz)
515	0	{
516	0	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
517	0	WORD32 i4_value, plane;
518
519	0	for(plane = 0; plane < 2; plane++)
520	0	{
521	0	pu1_nnz[plane] = 0;
522
523		/* Horizontal transform */
524	0	x4 = pi2_src[0];
525	0	x5 = pi2_src[1];
526	0	x6 = pi2_src[2];
527	0	x7 = pi2_src[3];
528
529	0	x0 = x4 + x5;
530	0	x1 = x4 - x5;
531	0	x2 = x6 + x7;
532	0	x3 = x6 - x7;
533
534		/* Vertical transform and quantization */
535	0	i4_value = (x0 + x2);
536	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
537	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
538	0	pu1_nnz[plane]);
539	0	pi2_dst[0] = i4_value;
540
541	0	i4_value = (x0 - x2);
542	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
543	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
544	0	pu1_nnz[plane]);
545	0	pi2_dst[2] = i4_value;
546
547	0	i4_value = (x1 - x3);
548	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
549	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
550	0	pu1_nnz[plane]);
551	0	pi2_dst[3] = i4_value;
552
553	0	i4_value = (x1 + x3);
554	0	FWD_QUANT(i4_value, pu2_threshold_matrix[0],
555	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
556	0	pu1_nnz[plane]);
557	0	pi2_dst[1] = i4_value;
558
559	0	pi2_dst += 4;
560	0	pi2_src += 4;
561	0	}
562	0	}
563
564		/**
565		*******************************************************************************
566		*
567		* @brief
568		* This function performs Single stage forward transform CF8 and quantization
569		* on 8x8 blocks
570		*
571		* @par Description:
572		* Performs single stage 8x8 forward transform CF8 after calculating the residue
573		* The result is then quantized
574		*
575		* @param[in] pu1_src
576		* Pointer to source sub-block
577		*
578		* @param[in] pu1_pred
579		* Pointer to prediction sub-block
580		*
581		* @param[in] pi2_out
582		* Pointer to residual sub-block
583		*
584		* @param[in] src_strd
585		* Source stride
586		*
587		* @param[in] pred_strd
588		* Prediction stride
589		*
590		* @param[in] pu2_scale_matrix
591		* Pointer to Forward Quant Scale Matrix
592		*
593		* @param[in] pu2_threshold_matrix
594		* Pointer to Forward Quant Threshold Matrix
595		*
596		* @param[in] u4_qbits
597		* QP_BITS_h264_8x8 + floor(QP/6)
598		*
599		* @param[in] u4_round_factor
600		* Quantization Round factor
601		*
602		* @param[out] pu1_nnz
603		* Total non-zero coefficients in the current sub-block
604		*
605		* @param[in] pi2_alt_dc_addr
606		* UNUSED
607		*
608		* @returns none
609		*
610		* @remarks:
611		* TODO: This function needs to be tested before integration
612		*
613		*******************************************************************************
614		*/
615		void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
616		UWORD8 *pu1_pred,
617		WORD16 *pi2_out,
618		WORD32 src_strd,
619		WORD32 pred_strd,
620		const UWORD16 *pu2_scale_matrix,
621		const UWORD16 *pu2_threshold_matrix,
622		UWORD32 u4_qbits,
623		UWORD32 u4_round_factor,
624		UWORD8 *pu1_nnz,
625		WORD16 *pu1_dc_alt_addr)
626	0	{
627	0	WORD16 *pi2_out_tmp = pi2_out;
628	0	WORD32 i;
629	0	WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
630	0	WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
631	0	UWORD32 u4_nonzero_coeff = 0;
632
633	0	UNUSED(pu1_dc_alt_addr);
634
635		/* Horizontal transform */
636	0	for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
637	0	{
638	0	r0 = pu1_src[0];
639	0	r0 -= pu1_pred[0];
640	0	r1 = pu1_src[1];
641	0	r1 -= pu1_pred[1];
642	0	r2 = pu1_src[2]; r2 -= pu1_pred[2];
643	0	r3 = pu1_src[3]; r3 -= pu1_pred[3];
644	0	r4 = pu1_src[4]; r4 -= pu1_pred[4];
645	0	r5 = pu1_src[5]; r5 -= pu1_pred[5];
646	0	r6 = pu1_src[6]; r6 -= pu1_pred[6];
647	0	r7 = pu1_src[7]; r7 -= pu1_pred[7];
648
649	0	a0 = r0 + r7;
650	0	a1 = r1 + r6;
651	0	a2 = r2 + r5;
652	0	a3 = r3 + r4;
653
654	0	a4 = a0 + a3;
655	0	a5 = a1 + a2;
656	0	a6 = a0 - a3;
657	0	a7 = a1 - a2;
658
659	0	pi2_out_tmp[0] = a4 + a5;
660	0	pi2_out_tmp[2] = a6 + (a7 >> 1);
661	0	pi2_out_tmp[4] = a4 - a5;
662	0	pi2_out_tmp[6] = (a6 >> 1) - a7;
663
664	0	a0 = r0 - r7;
665	0	a1 = r1 - r6;
666	0	a2 = r2 - r5;
667	0	a3 = r3 - r4;
668
669	0	a4 = a1 + a2 + ((a0 >> 1) + a0);
670	0	a5 = a0 - a3 - ((a2 >> 1) + a2);
671	0	a6 = a0 + a3 - ((a1 >> 1) + a1);
672	0	a7 = a1 - a2 + ((a3 >> 1) + a3);
673
674	0	pi2_out_tmp[1] = a4 + (a7 >> 2);
675	0	pi2_out_tmp[3] = a5 + (a6 >> 2);
676	0	pi2_out_tmp[5] = a6 - (a5 >> 2);
677	0	pi2_out_tmp[7] = (a4 >> 2) - a7;
678
679	0	pu1_src += src_strd;
680	0	pu1_pred += pred_strd;
681	0	pi2_out_tmp += 8;
682	0	}
683
684		/* vertical transform and quant */
685	0	pi2_out_tmp = pi2_out;
686	0	for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
687	0	{
688	0	r0 = pi2_out_tmp[0];
689	0	r1 = pi2_out_tmp[8];
690	0	r2 = pi2_out_tmp[16];
691	0	r3 = pi2_out_tmp[24];
692	0	r4 = pi2_out_tmp[32];
693	0	r5 = pi2_out_tmp[40];
694	0	r6 = pi2_out_tmp[48];
695	0	r7 = pi2_out_tmp[56];
696
697	0	a0 = r0 + r7;
698	0	a1 = r1 + r6;
699	0	a2 = r2 + r5;
700	0	a3 = r3 + r4;
701
702	0	a4 = a0 + a3;
703	0	a5 = a1 + a2;
704	0	a6 = a0 - a3;
705	0	a7 = a1 - a2;
706
707	0	a0 = r0 - r7;
708	0	a1 = r1 - r6;
709	0	a2 = r2 - r5;
710	0	a3 = r3 - r4;
711
712	0	r0 = a4 + a5;
713	0	r2 = a6 + (a7 >> 1);
714	0	r4 = a4 - a5;
715	0	r6 = (a6 >> 1) - a7;
716
717	0	a4 = a1 + a2 + ((a0 >> 1) + a0);
718	0	a5 = a0 - a3 - ((a2 >> 1) + a2);
719	0	a6 = a0 + a3 - ((a1 >> 1) + a1);
720	0	a7 = a1 - a2 + ((a3 >> 1) + a3);
721
722	0	r1 = a4 + (a7 >> 2);
723	0	r3 = a5 + (a6 >> 2);
724	0	r5 = a6 - (a5 >> 2);
725	0	r7 = (a4 >> 2) - a7;
726
727	0	FWD_QUANT(r0, pu2_threshold_matrix[0],
728	0	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
729	0	u4_nonzero_coeff);
730	0	pi2_out_tmp[0] = r0;
731
732	0	FWD_QUANT(r1, pu2_threshold_matrix[8],
733	0	pu2_scale_matrix[8], u4_round_factor, u4_qbits,
734	0	u4_nonzero_coeff);
735	0	pi2_out_tmp[8] = r1;
736
737	0	FWD_QUANT(r2, pu2_threshold_matrix[16],
738	0	pu2_scale_matrix[16], u4_round_factor, u4_qbits,
739	0	u4_nonzero_coeff);
740	0	pi2_out_tmp[16] = r2;
741
742	0	FWD_QUANT(r3, pu2_threshold_matrix[24],
743	0	pu2_scale_matrix[24], u4_round_factor, u4_qbits,
744	0	u4_nonzero_coeff);
745	0	pi2_out_tmp[24] = r3;
746
747	0	FWD_QUANT(r4, pu2_threshold_matrix[32],
748	0	pu2_scale_matrix[32], u4_round_factor, u4_qbits,
749	0	u4_nonzero_coeff);
750	0	pi2_out_tmp[32] = r4;
751
752	0	FWD_QUANT(r5, pu2_threshold_matrix[40],
753	0	pu2_scale_matrix[40], u4_round_factor, u4_qbits,
754	0	u4_nonzero_coeff);
755	0	pi2_out_tmp[40] = r5;
756
757	0	FWD_QUANT(r6, pu2_threshold_matrix[48],
758	0	pu2_scale_matrix[48], u4_round_factor, u4_qbits,
759	0	u4_nonzero_coeff);
760	0	pi2_out_tmp[48] = r6;
761
762	0	FWD_QUANT(r7, pu2_threshold_matrix[56],
763	0	pu2_scale_matrix[56], u4_round_factor, u4_qbits,
764	0	u4_nonzero_coeff);
765	0	pi2_out_tmp[56] = r7;
766
767	0	pi2_out_tmp++;
768	0	pu2_scale_matrix++;
769	0	pu2_threshold_matrix++;
770	0	}
771		/* Return total nonzero coefficients in the current sub block */
772	0	*pu1_nnz = u4_nonzero_coeff;
773	0	}