Coverage Report

Created: 2026-03-31 06:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libavc/common/x86/ih264_deblk_luma_avx2.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/*****************************************************************************/
21
/*                                                                           */
22
/*  File Name         : ih264_deblk_luma_avx2.c                              */
23
/*                                                                           */
24
/*  Description       : Contains function definitions for deblocking         */
25
/*                                                                           */
26
/*  List of Functions : ih264_deblk_luma_horz_bslt4_avx2()                   */
27
/*                      ih264_deblk_luma_vert_bslt4_avx2()                   */
28
/*                                                                           */
29
/*  Issues / Problems : None                                                 */
30
/*                                                                           */
31
/*  Revision History  :                                                      */
32
/*                                                                           */
33
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
34
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
35
/*                                      intrinsics                           */
36
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
37
/*****************************************************************************/
38
39
/*****************************************************************************/
40
/* File Includes                                                             */
41
/*****************************************************************************/
42
43
/* System include files */
44
#include <stdio.h>
45
#ifdef __ANDROID__
46
#include "log/log.h"
47
#include <cutils/log.h>
48
#endif
49
50
/* User include files */
51
#include "ih264_typedefs.h"
52
#include "ih264_platform_macros.h"
53
#include "ih264_deblk_edge_filters.h"
54
#include "ih264_macros.h"
55
56
57
/*****************************************************************************/
58
/*                                                                           */
59
/*  Function Name : ih264_deblk_luma_horz_bslt4_avx2()                       */
60
/*                                                                           */
61
/*  Description   : This function performs filtering of a luma block         */
62
/*                  horizontal edge when boundary strength is less than 4.   */
63
/*                                                                           */
64
/*  Inputs        : pu1_src       - pointer to the src sample q0             */
65
/*                  src_strd      - source stride                            */
66
/*                  alpha         - alpha value for the boundary             */
67
/*                  beta          - beta value for the boundary              */
68
/*                  u4_bs         - packed Boundary strength array           */
69
/*                  pu1_cliptab   - tc0_table                                */
70
/*                                                                           */
71
/*  Globals       : None                                                     */
72
/*                                                                           */
73
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
74
/*                  title "Filtering process for edges for bS less than 4"   */
75
/*                  in ITU T Rec H.264.                                      */
76
/*                                                                           */
77
/*  Outputs       : None                                                     */
78
/*                                                                           */
79
/*  Returns       : None                                                     */
80
/*                                                                           */
81
/*  Issues        : None                                                     */
82
/*                                                                           */
83
/*  Revision History:                                                        */
84
/*                                                                           */
85
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
86
/*         12 02 2015   Naveen Kumar P  Initial version                      */
87
/*         15 09 2020   Priyanka Bose   AVX2 Intel Intrinsics Support        */
88
/*****************************************************************************/
89
void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
90
                                       WORD32 src_strd,
91
                                       WORD32 alpha,
92
                                       WORD32 beta,
93
                                       UWORD32 u4_bs,
94
                                       const UWORD8 *pu1_cliptab)
95
35.1k
{
96
97
35.1k
    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
98
35.1k
    UWORD8 *pu1_HorzPixel;
99
35.1k
    __m256i zero = _mm256_setzero_si256();
100
35.1k
    __m128i zero_128 = _mm_setzero_si128();
101
35.1k
    __m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16;
102
35.1k
    __m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8;
103
35.1k
    __m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8;
104
35.1k
    __m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128;
105
35.1k
    __m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8;
106
35.1k
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
107
35.1k
    UWORD8 clip0, clip1, clip2, clip3;
108
109
35.1k
    pu1_HorzPixel = pu1_src - (src_strd << 2);
110
111
35.1k
    i16_posQ1 = src_strd;
112
35.1k
    i16_posQ2 = X2(src_strd);
113
35.1k
    i16_posP0 = X3(src_strd);
114
35.1k
    i16_posP1 = X2(src_strd);
115
35.1k
    i16_posP2 = src_strd;
116
117
35.1k
    p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0
118
35.1k
    p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1
119
35.1k
    p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2));
120
121
35.1k
    u1_Bs0 = (u4_bs >> 24) & 0xff;
122
35.1k
    u1_Bs1 = (u4_bs >> 16) & 0xff;
123
35.1k
    u1_Bs2 = (u4_bs >> 8) & 0xff;
124
35.1k
    u1_Bs3 = (u4_bs >> 0) & 0xff;
125
35.1k
    clip0 = pu1_cliptab[u1_Bs0];
126
35.1k
    clip1 = pu1_cliptab[u1_Bs1];
127
35.1k
    clip2 = pu1_cliptab[u1_Bs2];
128
35.1k
    clip3 = pu1_cliptab[u1_Bs3];
129
130
35.1k
    Alpha_8x16 = _mm_set1_epi16(alpha);
131
35.1k
    Beta_8x32 = _mm256_set1_epi16(beta);
132
133
35.1k
    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
134
35.1k
                                   u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
135
35.1k
                           u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
136
35.1k
                                   u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
137
138
35.1k
    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
139
35.1k
                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
140
35.1k
                           clip0, clip0);
141
142
35.1k
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128);
143
35.1k
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
144
35.1k
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128);
145
35.1k
    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128);
146
35.1k
    C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16);
147
148
    //Cond1 (ABS(p0 - q0) < alpha)
149
35.1k
    p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
150
35.1k
    q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);
151
35.1k
    q0_16x8   = _mm256_castsi256_si128(p0q0_32x8);
152
35.1k
    temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8);
153
35.1k
    temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8);
154
35.1k
    temp1_128 = _mm_add_epi8(temp1_128, temp2_128);
155
156
35.1k
    temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128);
157
35.1k
    temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128);
158
159
35.1k
    temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128);
160
35.1k
    temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128);
161
35.1k
    flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128);
162
35.1k
    flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b);
163
164
35.1k
    flag1_32x8  = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128);
165
166
    //Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta)
167
35.1k
    temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8);
168
35.1k
    temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8);
169
35.1k
    temp1 = _mm256_add_epi8(temp1, temp2);
170
171
35.1k
    temp2 = _mm256_unpacklo_epi8(temp1, zero);
172
35.1k
    temp1 = _mm256_unpackhi_epi8(temp1, zero);
173
174
35.1k
    temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
175
35.1k
    temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
176
177
35.1k
    flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
178
179
    //!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
180
35.1k
    flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
181
182
   //(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta)
183
35.1k
    temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8);
184
35.1k
    temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8);
185
35.1k
    temp1 = _mm256_add_epi8(temp1, temp2);
186
187
35.1k
    temp2 = _mm256_unpacklo_epi8(temp1, zero);
188
35.1k
    temp1 = _mm256_unpackhi_epi8(temp1, zero);
189
35.1k
    temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
190
35.1k
    temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
191
192
35.1k
    flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
193
35.1k
    flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
194
195
35.1k
    temp2 = _mm256_subs_epi16(zero, temp2);
196
35.1k
    temp1 = _mm256_subs_epi16(zero, temp1);
197
198
35.1k
    temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding
199
35.1k
    temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding
200
35.1k
    temp2 = _mm256_add_epi16(temp3,temp4);
201
35.1k
    C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2);       //
202
35.1k
    const_val4_8x32 = _mm256_set1_epi16(4);
203
204
35.1k
    res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
205
35.1k
    res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
206
207
35.1k
    temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero),
208
35.1k
                              _mm256_unpackhi_epi8(res1, zero));
209
35.1k
    temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero),
210
35.1k
                              _mm256_unpackhi_epi8(res2, zero));
211
212
35.1k
    temp1 = _mm256_slli_epi16(temp3, 2);
213
35.1k
    temp1 = _mm256_add_epi16(temp1, temp4);
214
35.1k
    temp1 = _mm256_add_epi16(temp1, const_val4_8x32);
215
35.1k
    in_macro_32x8 = _mm256_srai_epi16(temp1, 3);
216
217
35.1k
    in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3
218
35.1k
    C_8x32 = _mm256_subs_epi16(zero, C_8x32);
219
35.1k
    in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3
220
221
35.1k
    temp3 = _mm256_unpacklo_epi8(res1, zero); //q0
222
35.1k
    temp4 = _mm256_unpackhi_epi8(res1, zero); //p0
223
224
35.1k
    temp1 = _mm256_add_epi16(temp4, in_macro_32x8);
225
35.1k
    temp2 = _mm256_sub_epi16(temp3, in_macro_32x8);
226
227
35.1k
    temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed
228
229
35.1k
    temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0
230
231
35.1k
    temp2 = _mm256_and_si256(res1,
232
35.1k
                          _mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF)));
233
234
35.1k
    temp1 = _mm256_add_epi8(temp1, temp2);
235
35.1k
    temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
236
35.1k
    _mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1);
237
238
    //if(Ap < Beta) if(Aq < Beta)
239
35.1k
     temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero),
240
35.1k
                              _mm256_unpackhi_epi8(res1, zero));
241
242
35.1k
     temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1);
243
35.1k
     temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2);
244
245
35.1k
     temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1);
246
35.1k
     temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2);
247
248
35.1k
     temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20);     //p0 q0
249
35.1k
     temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31);
250
35.1k
     temp4 = _mm256_add_epi16(temp1, temp4); //p
251
35.1k
     in_macro_1  = _mm256_srai_epi16(temp4, 1);
252
35.1k
     temp3 = _mm256_add_epi16(temp1, temp3); //q
253
35.1k
     in_macro_2  = _mm256_srai_epi16(temp3, 1);
254
255
35.1k
     in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3
256
35.1k
     C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
257
35.1k
     in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3
258
259
35.1k
     in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3
260
35.1k
     C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
261
35.1k
     in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3
262
263
35.1k
     temp1 = _mm256_unpacklo_epi8(res2, zero);
264
35.1k
     temp2 = _mm256_unpackhi_epi8(res2, zero);
265
266
35.1k
     temp1 = _mm256_add_epi16(temp1, in_macro_1);
267
35.1k
     temp2 = _mm256_add_epi16(temp2, in_macro_2);
268
35.1k
     temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh
269
35.1k
     temp1 = _mm256_and_si256(temp1, flag2_32x8);
270
35.1k
     temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF)));
271
35.1k
     temp1 = _mm256_add_epi8(temp1, temp2);
272
35.1k
     temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
273
35.1k
     _mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1);
274
275
35.1k
}