Coverage Report

Created: 2026-03-20 07:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/common/x86/ihevc_deblk_ssse3_intr.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
*******************************************************************************
20
* @file
21
*  ihevc_deblk_ssse3_intr.c
22
*
23
* @brief
24
*  Contains function definitions for deblocking filters
25
*
26
* @author
27
*  Rishab
28
*
29
* @par List of Functions:
30
*   - ihevc_deblk_luma_vert_ssse3()
31
*   - ihevc_deblk_luma_horz_ssse3()
32
*   - ihevc_deblk_chroma_vert_ssse3()
33
*   - ihevc_deblk_chroma_horz_ssse3()
34
*
35
* @remarks
36
*  None
37
*
38
*******************************************************************************
39
*/
40
#include <stdlib.h>
41
#include <stdio.h>
42
#include <assert.h>
43
#include "ihevc_typedefs.h"
44
#include "ihevc_platform_macros.h"
45
#include "ihevc_macros.h"
46
#include "ihevc_deblk.h"
47
#include "ihevc_deblk_tables.h"
48
#include "ihevc_debug.h"
49
50
#include "ihevc_tables_x86_intr.h"
51
52
#include <immintrin.h>
53
/**
54
*******************************************************************************
55
*
56
* @brief
57
*       Decision process and filtering for the luma block vertical edge.
58
*
59
* @par Description:
60
*     The decision process for the luma block vertical edge is  carried out and
61
*     an appropriate filter is applied. The  boundary filter strength, bs should
62
*     be greater than 0.  The pcm flags and the transquant bypass flags should
63
*     be  taken care of by the calling function.
64
*
65
* @param[in] pu1_src
66
*  Pointer to the src sample q(0,0)
67
*
68
* @param[in] src_strd
69
*  Source stride
70
*
71
* @param[in] bs
72
*  Boundary filter strength of q(0,0)
73
*
74
* @param[in] quant_param_p
75
*  quantization parameter of p block
76
*
77
* @param[in] quant_param_q
78
*  quantization parameter of p block
79
*
80
* @param[in] beta_offset_div2
81
*
82
*
83
* @param[in] tc_offset_div2
84
*
85
*
86
* @param[in] filter_flag_p
87
*  flag whether to filter the p block
88
*
89
* @param[in] filter_flag_q
90
*  flag whether to filter the q block
91
*
92
* @returns
93
*
94
* @remarks
95
*  None
96
*
97
*******************************************************************************
98
*/
99
100
void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
101
                                 WORD32 src_strd,
102
                                 WORD32 bs,
103
                                 WORD32 quant_param_p,
104
                                 WORD32 quant_param_q,
105
                                 WORD32 beta_offset_div2,
106
                                 WORD32 tc_offset_div2,
107
                                 WORD32 filter_flag_p,
108
                                 WORD32 filter_flag_q)
109
25.0M
{
110
25.0M
    WORD32 qp_luma, beta_indx, tc_indx;
111
25.0M
    WORD32 beta, tc;
112
25.0M
    WORD32 d, dp, dq, d_sam0, d_sam3;
113
114
25.0M
    WORD32 d3, d0, de_0, de_1, de_2, de_3;
115
25.0M
    WORD32 de, dep, deq;
116
25.0M
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;
117
118
119
25.0M
    {
120
25.0M
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
121
25.0M
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
122
123
124
125
25.0M
        ASSERT((bs > 0) && (bs <= 3));
126
25.0M
        ASSERT(filter_flag_p || filter_flag_q);
127
128
25.0M
        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
129
25.0M
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
130
131
        /* BS based on implementation can take value 3 if it is intra/inter egde          */
132
        /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
133
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
134
        /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
135
136
25.0M
        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
137
138
25.0M
        beta = gai4_ihevc_beta_table[beta_indx];
139
25.0M
        tc = gai4_ihevc_tc_table[tc_indx];
140
25.0M
        if(0 == tc)
141
730k
        {
142
730k
            return;
143
730k
        }
144
24.3M
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
145
24.3M
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));
146
147
24.3M
        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
148
24.3M
        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
149
150
24.3M
        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
151
24.3M
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);
152
153
24.3M
        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);
154
155
156
        //to get all 1's of 8 bit in (1)
157
24.3M
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
158
24.3M
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
159
        //accumulating values foe dp3 dq3 , dp0 dq0 values
160
24.3M
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
161
162
24.3M
        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
163
        // to get all 1,-1 sets of 16 bits in (0)
164
24.3M
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
165
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
166
24.3M
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
167
        //to get 16 bit 1's
168
24.3M
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
169
170
171
        // dq3 dp3 dq0 dp0
172
24.3M
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
173
24.3M
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
174
24.3M
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
175
        // dq dp d3 d0
176
24.3M
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
177
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
178
24.3M
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
179
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
180
24.3M
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
181
182
        ///store back in a single variable
183
24.3M
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
184
24.3M
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
185
24.3M
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
186
187
24.3M
        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
188
24.3M
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
189
24.3M
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
190
24.3M
        dq = _mm_cvtsi128_si32(mask_16x8b);
191
        //getting d
192
24.3M
        d = d0 + d3;
193
194
        ///store back in a single variable
195
24.3M
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
196
24.3M
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
197
24.3M
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
198
199
24.3M
        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
200
24.3M
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
201
24.3M
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
202
24.3M
        de_3 = _mm_cvtsi128_si32(mask_16x8b);
203
204
24.3M
        de = 0;
205
24.3M
        dep = 0;
206
24.3M
        deq = 0;
207
24.3M
        if(d < beta)
208
22.6M
        {
209
22.6M
            d_sam0 = 0;
210
22.6M
            if((2 * d0 < (beta >> 2))
211
21.5M
                            && (de_2 < (beta >> 3))
212
20.1M
                            && (de_0 < ((5 * tc + 1) >> 1)))
213
19.9M
            {
214
19.9M
                d_sam0 = 1;
215
19.9M
            }
216
217
22.6M
            d_sam3 = 0;
218
22.6M
            if((2 * d3 < (beta >> 2))
219
21.5M
                            && (de_3 < (beta >> 3))
220
20.1M
                            && de_1 < ((5 * tc + 1) >> 1))
221
19.9M
            {
222
19.9M
                d_sam3 = 1;
223
19.9M
            }
224
225
22.6M
            de = (d_sam0 & d_sam3) + 1;
226
22.6M
            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
227
22.6M
            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
228
22.6M
            if(tc <= 1)
229
2.55M
            {
230
2.55M
                dep = 0;
231
2.55M
                deq = 0;
232
2.55M
            }
233
22.6M
        }
234
235
24.3M
    }
236
237
24.3M
    if(de != 0)
238
22.6M
    {
239
240
241
22.6M
        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
242
22.6M
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));
243
244
22.6M
        if(de == 2)
245
19.5M
        {
246
19.5M
            __m128i temp_pq_str0_16x8b;
247
19.5M
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
248
19.5M
            __m128i temp_pq2_str0_16x8b;
249
19.5M
            __m128i temp_pq_str1_16x8b;
250
19.5M
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
251
19.5M
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
252
19.5M
            __m128i const2_8x16b, const2tc_8x16b;
253
19.5M
            LWORD64 mask, tc2;
254
19.5M
            tc = tc << 1;
255
19.5M
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
256
19.5M
            tc2 = ((LWORD64)tc);
257
258
19.5M
            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
259
            //q'0-q'1-2 ,p'0-p'1-2
260
19.5M
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
261
19.5M
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);
262
263
19.5M
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
264
19.5M
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
265
19.5M
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
266
            //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
267
19.5M
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
268
19.5M
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
269
270
19.5M
            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
271
            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
272
19.5M
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);
273
274
19.5M
            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);
275
276
            //q'1-2, p'1-2
277
19.5M
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
278
19.5M
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);
279
280
19.5M
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
281
19.5M
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
282
283
19.5M
            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
284
19.5M
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
285
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
286
19.5M
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
287
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
288
19.5M
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);
289
290
19.5M
            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
291
19.5M
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
292
293
            //clipping mask design
294
19.5M
            temp_str1_16x8b = _mm_setzero_si128();
295
19.5M
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
296
19.5M
            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
297
19.5M
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
298
19.5M
            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
299
300
            //clipping mask design
301
19.5M
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
302
19.5M
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
303
            //calculating Clipping MAX for all pixel values.
304
19.5M
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
305
19.5M
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);
306
307
308
            //q'2-q'0-2,p'2-p'0-2
309
19.5M
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
310
19.5M
            temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);
311
312
19.5M
            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
313
19.5M
            temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);
314
315
19.5M
            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
316
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
317
19.5M
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);
318
319
19.5M
            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
320
321
            //calculating Clipping MIN for all pixel values.
322
19.5M
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
323
19.5M
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
324
            //q'0-q'1-2 ,p'0-p'1-2
325
19.5M
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
326
19.5M
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
327
            //q'1-2 p'1-2
328
19.5M
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
329
            //to get 2 in 16 bit
330
19.5M
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
331
            //to get q33 q23 q13 q03, p33 p23 p13 p03
332
19.5M
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
333
19.5M
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
334
19.5M
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);
335
336
            //q'1, p'1 (adding 2)
337
19.5M
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
338
            //q'0-q'1,p'0-p'1
339
19.5M
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
340
            //q'2-q'1,p'2-p'1
341
19.5M
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
342
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
343
19.5M
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
344
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
345
19.5M
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
346
347
            //normalisation of all modified pixels
348
19.5M
            temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
349
19.5M
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
350
19.5M
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
351
352
            //getting p0 p1 together and p2 p3 together
353
19.5M
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
354
19.5M
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
355
            //getting q1 q0 together and  q3 q2 together
356
19.5M
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
357
19.5M
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
358
            //getting p's of row0 row1 together and of row2 row3 together
359
19.5M
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
360
19.5M
            temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
361
            //getting q's of row0 row1 together and of row2 row3 together
362
19.5M
            temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
363
19.5M
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
364
            //getting values for respective rows in 16 bit
365
19.5M
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
366
19.5M
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
367
19.5M
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
368
19.5M
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
369
            //packing values to 8 bit
370
19.5M
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
371
19.5M
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
372
            //Clipping MAX
373
19.5M
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
374
19.5M
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
375
            //Clipping MIN
376
19.5M
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
377
19.5M
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
378
            //separating row 2 and row 3
379
19.5M
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
380
19.5M
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);
381
382
19.5M
        }
383
384
3.04M
        else
385
3.04M
        {
386
387
3.04M
            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
388
3.04M
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
389
3.04M
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
390
3.04M
            __m128i const2_8x16b, consttc_8x16b;
391
392
3.04M
            LWORD64 mask1;
393
3.04M
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);
394
395
3.04M
            consttc_8x16b = _mm_set1_epi32(tc);
396
397
398
3.04M
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
399
3.04M
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);
400
401
3.04M
            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
402
3.04M
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);
403
404
3.04M
            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
405
3.04M
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
406
            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
407
3.04M
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);
408
409
3.04M
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
410
            // (-3q1+9q0),(-9p0+3p1)
411
3.04M
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
412
            //converting to 16 bit
413
3.04M
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
414
            //getting -tc store
415
3.04M
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
416
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
417
3.04M
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
418
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
419
3.04M
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
420
            //getting -tc store
421
3.04M
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
422
            //calc 10 *tc
423
3.04M
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
424
            //const 1
425
3.04M
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
426
3.04M
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
427
3.04M
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
428
            //getting the mask values
429
3.04M
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
430
            //loaded coef for delta1 calculation
431
3.04M
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
432
            //(-2q1+q0),(p0-2p1)
433
3.04M
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
434
            //const 8
435
3.04M
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
436
            //rearranging the mask values
437
3.04M
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
438
            //normalisation of the filter
439
3.04M
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
440
3.04M
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
441
442
            //getting deltaq0
443
3.04M
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
444
            //packing  d3q d2q d1q d0q d3p d2p d1p d0p
445
3.04M
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
446
            //absolute delta
447
3.04M
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
448
            //Clipping of delta0
449
3.04M
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
450
            //mask for |delta| < 10*tc
451
3.04M
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
452
            //Clipping of delta0
453
3.04M
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);
454
455
456
            //delta 1 calc starts
457
458
            //getting q32 q22 q12 q02 p32 p12 p22 p02
459
3.04M
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
460
3.04M
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
461
3.04M
            tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
462
3.04M
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
463
            //constant 1
464
3.04M
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
465
            //tc>>1 16 bit
466
3.04M
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
467
468
            //getting -tc>>1 store  16 bit
469
3.04M
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
470
            //2*delta0
471
3.04M
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
472
473
            //getting  all respective q's and p's together
474
3.04M
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
475
3.04M
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
476
            //final adds for deltap1 and deltaq1
477
3.04M
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
478
3.04M
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
479
3.04M
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
480
3.04M
            tmp2_const_8x16b = _mm_setzero_si128();
481
3.04M
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
482
483
            // clipping delta1
484
3.04M
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
485
            // clipping delta1
486
3.04M
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
487
488
            //getting the mask ready
489
3.04M
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
490
            //masking of the delta values |delta|<10*tc
491
3.04M
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
492
3.04M
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
493
            //packing dq1 dq0 dp0 dp1
494
3.04M
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
495
3.04M
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
496
3.04M
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
497
3.04M
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
498
499
            //masking of the delta values dep, deq , filter_p ,filter_q
500
3.04M
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
501
3.04M
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
502
            //converting 8bit to 16 bit
503
3.04M
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
504
3.04M
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
505
3.04M
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
506
3.04M
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
507
            //shuffle values loaded
508
3.04M
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
509
3.04M
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
510
            //arranging each row delta in different registers
511
3.04M
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
512
3.04M
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
513
3.04M
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
514
3.04M
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);
515
516
            //adding the respective delta
517
3.04M
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
518
3.04M
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
519
3.04M
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
520
3.04M
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
521
            //saturating to 8 bit
522
3.04M
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
523
3.04M
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
524
            //separating different rows
525
3.04M
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
526
3.04M
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
527
3.04M
        }
528
529
22.6M
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
530
22.6M
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
531
22.6M
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
532
22.6M
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
533
22.6M
    }
534
24.3M
}
535
536
/**
*******************************************************************************
*
* @brief
*  Decision process and filtering for a luma block horizontal edge (SSSE3).
*  Operates on a 4-sample-wide edge segment: the edge runs along pu1_src,
*  with the p-side rows above (pu1_src - n*src_strd) and the q-side rows
*  below (pu1_src + n*src_strd).
*
* @param[in] pu1_src
*  Pointer to the first q-side sample of the edge segment
* @param[in] src_strd
*  Source stride in bytes
* @param[in] bs
*  Boundary strength (must be > 0; 2 or 3 implies an intra/inter edge)
* @param[in] quant_param_p
*  QP of the p-side block
* @param[in] quant_param_q
*  QP of the q-side block
* @param[in] beta_offset_div2
*  Slice-level beta offset / 2
* @param[in] tc_offset_div2
*  Slice-level tc offset / 2
* @param[in] filter_flag_p
*  1 if the p-side samples may be modified, else 0
* @param[in] filter_flag_q
*  1 if the q-side samples may be modified, else 0
*
* @returns None; filtered samples are written back in place.
*
* NOTE(review): the __m128i locals are aggressively re-used for unrelated
* values (pixels, masks, constants) to keep register pressure low, so the
* variable names do not always describe their current contents — rely on
* the inline comments for the data flow.
*
*******************************************************************************
*/
void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;

    /* d0/d3: gradient measures for rows 0 and 3; dp/dq: per-side activity */
    WORD32 d0, d3, dp, dq, d;
    /* de_0..de_3: strong-filter decision metrics extracted from SIMD lanes */
    WORD32 de_0, de_1, de_2, de_3;
    WORD32 d_sam0, d_sam3;
    /* de: 0 = no filtering, 1 = normal filter, 2 = strong filter;
       dep/deq: whether p1/q1 are also filtered in the normal path */
    WORD32 de, dep, deq;

    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;


    /* ---- Filter on/off and strong/normal decision ---- */
    {
        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;

        ASSERT((bs > 0));
        ASSERT(filter_flag_p || filter_flag_q);

        /* average QP of the two blocks selects beta/tc table entries */
        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter egde          */
        /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
        /* the above desired functionallity is achieved by doing (2*(bs>>1))              */

        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        /* tc == 0 means no filtering is possible for this edge */
        if(0 == tc)
        {
            return;
        }
        /* load 4 rows on each side of the edge (8 bytes per row) */
        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));   /* q3 row */
        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));   /* p3 row */

        /* interleave rows so each 32-bit group holds one column's p3..p0 / q0..q3 */
        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);

        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);

        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);

        /* keep only columns 0 and 3 (the two decision columns of the segment) */
        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
        //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);

        /* |p2-2p1+p0| and |q2-2q1+q0| partial sums via (u8 x s8) multiply-add */
        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values foe dp3 dq3 , dp0 dq0 values
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        /* extract scalar decision metrics from the SIMD lanes */
        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        /* edge is filtered only when overall activity d is below beta */
        if(d < beta)
        {
            /* strong-filter test for column 0 (flatness + step-size checks) */
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            /* strong-filter test for column 3 */
            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && de_1 < ((5 * tc + 1) >> 1))
            {
                d_sam3 = 1;
            }

            /* de: 2 = strong filter (both columns passed), 1 = normal filter */
            de = (d_sam0 & d_sam3) + 1;
            /* note: >> binds tighter than <, so this is dp < ((beta+beta/2)>>3) */
            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {
        /* ---- Strong filter path: modifies p2..p0 and q0..q2 ---- */
        if(2 == de)
        {

            __m128i temp_pq0_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;

            LWORD64 mask, tc2;
            /* strong filter clips to +/- 2*tc */
            tc = tc << 1;
            /* per-side enable mask packed into the sign bits of two 32-bit halves */
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            /* broadcast the low tc byte to all 16 lanes */
            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            /* tc clamp is zeroed on a side whose filter flag is off, which
               freezes that side's pixels at their original values below */
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
            //for clipping calc
            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
            //saving the unmodified data of q1 p1 q0 p0
            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
            //CLIpping MAX and MIN for q1 p1 q0 p0
            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);

            //calculating Clipping MAX and MIN for p2 and q2 .
            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);


            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
            //q'1 p'1 q'0 p'0
            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
            //pack with the unmodified data of q2 and p2
            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
            //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
            //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);

            /* store all six filtered rows back in place */
            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);


        }
        /* ---- Normal filter path: modifies p0/q0 and optionally p1/q1 ---- */
        else
        {

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
            __m128i coefdelta_0_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 maskp0, maskp1, maskq0, maskq1;
            maskp0 = (LWORD64)filter_flag_p;
            maskq0 = (LWORD64)filter_flag_q;
            maskp1 = (LWORD64)dep;
            maskq1 = (LWORD64)deq;
            consttc_8x16b = _mm_set1_epi32(tc);

            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);

            //getting -tc store
            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);

            //getting tc in 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);

            //const 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //calc 10 *tc
            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta0 without normalisation and clipping
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);

            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);

            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);

            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
            //getting -tc
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //packing  d03q d02q d01q d0q d03p d02p d01p d00p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
            //absolute delta
            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);

            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);

            //(-tc)>>1 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //mask for |delta| < 10*tc
            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp0_const_8x16b = _mm_setzero_si128();
            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
            //getting  all respective q's and p's together
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));

            //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
            //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));

            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
            //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);

            //rearranging the mask values
            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);

            /* turn 0/1 flags into all-zero / all-one 32-bit lane masks */
            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);

            //combining mask delta1
            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //combining mask delat0
            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);


            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
            //separating p and q delta 0 and addinq p0 and q0
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
            //separating p and q delta 0 and addinq p0 and q0
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
            //packing p1 q1 and p0 q0 to 8 bit
            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);

            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);

            /* store the four (possibly) modified rows back in place */
            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);


        }



    }

}
987
988
/**
*******************************************************************************
*
* @brief
*  Filtering for a chroma block vertical edge (SSSE3). Processes 4 rows of
*  interleaved UV samples around a vertical edge at pu1_src; only p0 and q0
*  on each side of the edge are modified, clipped to +/- tc per component.
*
* @param[in] pu1_src
*  Pointer to the first q-side sample (interleaved U/V) of row 0
* @param[in] src_strd
*  Source stride in bytes
* @param[in] quant_param_p
*  QP of the p-side block
* @param[in] quant_param_q
*  QP of the q-side block
* @param[in] qp_offset_u
*  Chroma QP offset for the U (Cb) component
* @param[in] qp_offset_v
*  Chroma QP offset for the V (Cr) component
* @param[in] tc_offset_div2
*  Slice-level tc offset / 2
* @param[in] filter_flag_p
*  1 if the p-side samples may be modified, else 0
* @param[in] filter_flag_q
*  1 if the q-side samples may be modified, else 0
*
* @returns None; filtered samples are written back in place.
*
*******************************************************************************
*/
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
    /* luma-to-chroma QP mapping; QPs above 57 are handled outside the table */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* "+ 2" corresponds to 2*(bs-1) with bs fixed at 2 for chroma */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* nothing to do when both components have zero clipping range */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* load 8 interleaved UV bytes per row: p1 p0 | q0 q1 (U and V pairs) */
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* filter flags packed into sign bits; tc_u/tc_v packed into 16-bit lanes */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        /* keeps the untouched p1/q1 bytes when merging results back */
        mask = 0xffff00000000ffffLL;

        /* rows 0+1 in one register, rows 2+3 in the other */
        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag: delta forced to 0 on a disabled side
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //geting values for row1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        /* store all four rows back in place (8 bytes each) */
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}
1138
1139
/**
*******************************************************************************
*
* @brief
*  Filtering for a chroma block horizontal edge (SSSE3). Modifies at most
*  one row of pixels on each side of the edge (p0 above, q0 below), 8 bytes
*  per row. The tc clip mask interleaves tc_u/tc_v per 16-bit lane, so the
*  8 bytes appear to be interleaved U/V sample pairs (semi-planar chroma) —
*  NOTE(review): confirm layout against callers.
*
* @param[in] pu1_src
*  Pointer to the first pixel of the q0 row (row just below the edge)
*
* @param[in] src_strd
*  Source stride in bytes
*
* @param[in] quant_param_p
*  QP of the block on the p (top) side of the edge
*
* @param[in] quant_param_q
*  QP of the block on the q (bottom) side of the edge
*
* @param[in] qp_offset_u
*  Chroma QP offset applied for the U component
*
* @param[in] qp_offset_v
*  Chroma QP offset applied for the V component
*
* @param[in] tc_offset_div2
*  tc offset (divided by 2) from the slice header; doubled before use
*
* @param[in] filter_flag_p
*  Non-zero => the p side may be modified; zero => p row left untouched
*
* @param[in] filter_flag_q
*  Non-zero => the q side may be modified; zero => q row left untouched
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    /* At least one side must be filtered or the call is pointless. */
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */

    /* Average QP of the two sides, then map to chroma QP:
     * below 0 and above 57 pass through (with -6 above 57), otherwise
     * the standard chroma QP lookup table is used. */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* tc index = chroma QP + 2 (BS is 2) + 2*tc_offset_div2, clipped to
     * the valid table range [0, 53]. */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* Both clip thresholds zero => delta would be clamped to 0 for every
     * lane, so nothing can change; skip all the SIMD work. */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* Load 8 bytes from each of the four rows around the edge:
     * p1 (2 rows above), p0, q0 (at pu1_src), q1. */
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* Pack each filter flag into the sign bit (bit 31) of a 32-bit
         * word: p flag in the low dword, q flag in the high dword. A
         * 32-bit arithmetic shift right by 31 later turns each into an
         * all-ones / all-zeros lane mask. */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        /* tc_u in the low 16 bits, tc_v in the next 16: matches the
         * U/V interleaving of the pixel lanes after unpacking. */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        /* Interleave p1 with p0 and q0 with q1 byte-wise so that each
         * 16-bit lane holds a vertical (p1,p0) or (q0,q1) byte pair,
         * ready for _mm_maddubs_epi16. */
        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        /* delta0/delta1 hold the per-byte filter taps; together with the
         * add below they realize the (1, -4, 4, -1) chroma delta filter. */
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* Multiply-add unsigned pixel bytes by signed taps, producing
         * partial 16-bit deltas for the p pairs and the q pairs. */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        /* x == x comparison yields all-ones; reused below to build -tc,
         * the constant 1 and the constant 4 without extra loads. */
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // filter flag mask and tc mask
        /* Broadcast the (tc_u, tc_v) dword to all four dwords. */
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        /* Sign-extend bit 31 of each dword: dword0 = p-flag mask,
         * dword1 = q-flag mask (all-ones when the flag was set). */
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc (negate tc per 16-bit lane: lower clip bound)
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1 (all-ones >> 15 = 1 per 16-bit lane)
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp: broadcast the p-flag dword mask to all lanes
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting const 4 (1 << 2 = 4 per lane: rounding offset)
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag: broadcast the q-flag dword mask to all lanes
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition (add 4 before the >> 3 for rounding)
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation: delta = (q0 - p0)*4 + p1 - q1 + 4 >> 3
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit (zero-extend bytes to words)
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX: delta = min(delta, tc)
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit (zero-extend bytes to words)
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN: delta = max(delta, -tc)
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag: zero the delta on any side not filtered
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q0' = q0 - delta, p0' = p0 + delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed back to unsigned bytes with saturation
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        /* Write back only the two rows adjacent to the edge. */
        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}