Coverage Report

Created: 2025-12-14 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_ihadamard_scaling_avx2.c
Line
Count
Source
1
/******************************************************************************
2
+ *
3
+ * Copyright (C) 2015 The Android Open Source Project
4
+ *
5
+ * Licensed under the Apache License, Version 2.0 (the "License");
6
+ * you may not use this file except in compliance with the License.
7
+ * You may obtain a copy of the License at:
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ *
17
+ *****************************************************************************
18
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
+*/
20
/**
21
+ *******************************************************************************
22
+ * @file
23
+ *  ih264_ihadamard_scaling_avx2.c
24
+ *
25
+ * @brief
26
+ *  Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
27
+ *
28
+ * @author
29
+ *  Priyanka
30
+ *
31
+ *  @par List of Functions:
32
+ *  - ih264_ihadamard_scaling_4x4_avx2()
33
+ *
34
+ * @remarks
35
+ *
36
+ *******************************************************************************
37
+ */
38
/*****************************************************************************/
39
/* File Includes                                                             */
40
/*****************************************************************************/
41
42
/* User include files */
43
#include "ih264_typedefs.h"
44
#include "ih264_defs.h"
45
#include "ih264_trans_macros.h"
46
#include "ih264_macros.h"
47
#include "ih264_trans_data.h"
48
#include "ih264_size_defs.h"
49
#include "ih264_structs.h"
50
#include "ih264_trans_quant_itrans_iquant.h"
51
#include <immintrin.h>
52
53
/*
54
+ ********************************************************************************
55
+ *
56
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
57
+ * of a 16x16 intra prediction macroblock, and then performs scaling.
58
+ * prediction buffer
59
+ *
60
+ * @par Description:
61
+ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
62
+ *  The inverse-transformed content is then scaled based on the Qp value.
63
+ *
64
+ * @param[in] pi2_src
65
+ *  input 4x4 block of DC coefficients
66
+ *
67
+ * @param[out] pi2_out
68
+ *  output 4x4 block
69
+ *
70
+ * @param[in] pu2_iscal_mat
71
+ *  pointer to scaling list
72
+ *
73
+ * @param[in] pu2_weigh_mat
74
+ *  pointer to weight matrix
75
+ *
76
+ * @param[in] u4_qp_div_6
77
+ *  Floor (qp/6)
78
+ *
79
+ * @param[in] pi4_tmp
80
+ *  temporary buffer of size 1*16 (not used by this AVX2 implementation)
81
+ *
82
+ * @returns none
83
+ *
84
+ * @remarks none
85
+ *
86
+ *******************************************************************************
87
*/
88
89
#include <stdint.h>
90
#include <string.h>
91
92
#include <stdio.h>
93
94
#ifdef __ANDROID__
95
#include "log/log.h"
96
#include <cutils/log.h>
97
#endif
98
99
100
void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
101
                                      WORD16* pi2_out,
102
                                      const UWORD16 *pu2_iscal_mat,
103
                                      const UWORD16 *pu2_weigh_mat,
104
                                      UWORD32 u4_qp_div_6,
105
                                      WORD32* pi4_tmp)
106
2.89k
{ 
107
2.89k
    __m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2;
108
2.89k
    __m256i src_r0_r1, src_r2_r3;
109
2.89k
    __m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3;
110
2.89k
    __m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
111
2.89k
    __m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
112
2.89k
    __m256i zero =  _mm256_setzero_si256();
113
114
2.89k
    __m128i t0 ,t1;
115
2.89k
    UNUSED (pi4_tmp);
116
117
2.89k
    src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
118
119
2.89k
    temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero);  
120
2.89k
    temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero);        // b0 b1 b2..         d0 d1...
121
2.89k
    temp0 = _mm256_unpacklo_epi16(temp0, temp1); 
122
2.89k
    tmp0 =  _mm256_permute2x128_si256(temp0,zero,0x20);    //tmp0 tmp3
123
2.89k
    tmp1 =  _mm256_permute2x128_si256(temp0,zero,0x31);    //tmp1  tmp2
124
125
2.89k
    temp0 = _mm256_unpacklo_epi32(tmp0, tmp1);             //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
126
2.89k
    temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
127
    
128
2.89k
    temp1 = _mm256_shuffle_epi32(temp1,0b01001110);
129
2.89k
    tmp0 = _mm256_add_epi16(temp0, temp1);
130
2.89k
    tmp1 = _mm256_sub_epi16(temp0, temp1);
131
132
2.89k
    temp0 =   _mm256_unpacklo_epi64(tmp0, tmp1);
133
2.89k
    temp1 =   _mm256_unpackhi_epi64(tmp0, tmp1);
134
2.89k
    tmp0 = _mm256_add_epi16(temp0, temp1);
135
2.89k
    tmp1 = _mm256_sub_epi16(temp0, temp1);
136
137
2.89k
    temp0 = _mm256_unpacklo_epi32(tmp0, tmp1);             //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
138
2.89k
    temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); 
139
    
140
2.89k
    temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);             //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
141
2.89k
    temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
142
   
143
2.89k
    tmp0 = _mm256_unpacklo_epi16(temp0, temp1);             //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
144
2.89k
    tmp1 = _mm256_unpackhi_epi16(temp0, temp1);
145
146
2.89k
    temp0 = _mm256_unpacklo_epi32(tmp0, tmp1);             //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
147
2.89k
    temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
148
    
149
2.89k
    temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
150
2.89k
    tmp0 = _mm256_add_epi16(temp0, temp1);
151
2.89k
    tmp1 = _mm256_sub_epi16(temp0, temp1);
152
2.89k
    temp0 =   _mm256_unpacklo_epi64(tmp0, tmp1);
153
2.89k
    temp1 =   _mm256_unpackhi_epi64(tmp0, tmp1);
154
2.89k
    tmp0 = _mm256_add_epi16(temp0, temp1);
155
2.89k
    tmp1 = _mm256_sub_epi16(temp0, temp1);
156
157
2.89k
    temp0 =   _mm256_unpacklo_epi64(tmp0, tmp1);
158
2.89k
    temp1 =   _mm256_unpackhi_epi64(tmp0, tmp1);
159
160
161
2.89k
    r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
162
2.89k
    r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));
163
164
2.89k
    src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val);
165
2.89k
    src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val);
166
167
    //Scaling
168
2.89k
    if(u4_qp_div_6 >= 6)
169
1.63k
    {
170
1.63k
        src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6);
171
1.63k
        src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6);
172
1.63k
    }
173
1.26k
    else
174
1.26k
    {
175
1.26k
        temp0  = _mm256_add_epi32(src_r0_r1, add_rshift);
176
1.26k
        temp1  = _mm256_add_epi32(src_r2_r3, add_rshift);
177
1.26k
        src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6);
178
1.26k
        src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6);
179
1.26k
    }
180
181
2.89k
    src = _mm256_packs_epi32(src_r0_r1, src_r2_r3);
182
2.89k
    _mm256_storeu_si256((__m256i *) (&pi2_out[0]), src);
183
2.89k
}
184