Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/common/dct.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
5
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
6
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
7
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
8
 *          Min Chen <min.chen@multicorewareinc.com>
9
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10
 *          Nabajit Deka <nabajit@multicorewareinc.com>
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
25
 *
26
 * This program is also available under a commercial proprietary license.
27
 * For more information, contact us at license @ x265.com.
28
 *****************************************************************************/
29
30
#include "common.h"
31
#include "primitives.h"
32
#include "contexts.h"   // costCoeffNxN_c
33
#include "threading.h"  // BSR
34
35
using namespace X265_NS;
36
37
#if _MSC_VER
38
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
39
#endif
40
41
/* Forward 4x4 DST, one pass, using the fast (factorised) algorithm.  The
 * full matrix multiplication and this fast form give identical results.
 *
 * block - 16 input samples, read as four consecutive rows of four
 * coeff - 16 outputs; the four results computed from input row r are stored
 *         at coeff[r], coeff[4 + r], coeff[8 + r] and coeff[12 + r]
 * shift - right shift applied after adding the rounding term 1 << (shift - 1) */
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)
{
    const int rounding = 1 << (shift - 1);

    for (int row = 0; row < 4; row++)
    {
        const int16_t* in = block + 4 * row;
        const int s0 = in[0];
        const int s1 = in[1];
        const int s2 = in[2];
        const int s3 = in[3];

        // Partial terms shared by several outputs
        const int sum03  = s0 + s3;
        const int sum13  = s1 + s3;
        const int diff01 = s0 - s1;
        const int mid    = 74 * s2;

        coeff[row]      = (int16_t)((29 * sum03 + 55 * sum13 + mid + rounding) >> shift);
        coeff[4 + row]  = (int16_t)((74 * (s0 + s1 - s3) + rounding) >> shift);
        coeff[8 + row]  = (int16_t)((29 * diff01 + 55 * sum03 - mid + rounding) >> shift);
        coeff[12 + row] = (int16_t)((55 * diff01 - 29 * sum13 + mid + rounding) >> shift);
    }
}
62
63
static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
64
0
{
65
0
    int i, c[4];
66
0
    int rnd_factor = 1 << (shift - 1);
67
68
0
    for (i = 0; i < 4; i++)
69
0
    {
70
        // Intermediate Variables
71
0
        c[0] = tmp[i] + tmp[8 + i];
72
0
        c[1] = tmp[8 + i] + tmp[12 + i];
73
0
        c[2] = tmp[i] - tmp[12 + i];
74
0
        c[3] = 74 * tmp[4 + i];
75
76
0
        block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
77
0
        block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
78
0
        block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
79
0
        block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
80
0
    }
81
0
}
82
83
static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
84
0
{
85
0
    int j, k;
86
0
    int E[8], O[8];
87
0
    int EE[4], EO[4];
88
0
    int EEE[2], EEO[2];
89
0
    int add = 1 << (shift - 1);
90
91
0
    for (j = 0; j < line; j++)
92
0
    {
93
        /* E and O */
94
0
        for (k = 0; k < 8; k++)
95
0
        {
96
0
            E[k] = src[k] + src[15 - k];
97
0
            O[k] = src[k] - src[15 - k];
98
0
        }
99
100
        /* EE and EO */
101
0
        for (k = 0; k < 4; k++)
102
0
        {
103
0
            EE[k] = E[k] + E[7 - k];
104
0
            EO[k] = E[k] - E[7 - k];
105
0
        }
106
107
        /* EEE and EEO */
108
0
        EEE[0] = EE[0] + EE[3];
109
0
        EEO[0] = EE[0] - EE[3];
110
0
        EEE[1] = EE[1] + EE[2];
111
0
        EEO[1] = EE[1] - EE[2];
112
113
0
        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
114
0
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
115
0
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
116
0
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
117
118
0
        for (k = 2; k < 16; k += 4)
119
0
        {
120
0
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
121
0
                                       g_t16[k][3] * EO[3] + add) >> shift);
122
0
        }
123
124
0
        for (k = 1; k < 16; k += 2)
125
0
        {
126
0
            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
127
0
                                        g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
128
0
                                        add) >> shift);
129
0
        }
130
131
0
        src += 16;
132
0
        dst++;
133
0
    }
134
0
}
135
136
static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
137
0
{
138
0
    int j, k;
139
0
    int E[16], O[16];
140
0
    int EE[8], EO[8];
141
0
    int EEE[4], EEO[4];
142
0
    int EEEE[2], EEEO[2];
143
0
    int add = 1 << (shift - 1);
144
145
0
    for (j = 0; j < line; j++)
146
0
    {
147
        /* E and O*/
148
0
        for (k = 0; k < 16; k++)
149
0
        {
150
0
            E[k] = src[k] + src[31 - k];
151
0
            O[k] = src[k] - src[31 - k];
152
0
        }
153
154
        /* EE and EO */
155
0
        for (k = 0; k < 8; k++)
156
0
        {
157
0
            EE[k] = E[k] + E[15 - k];
158
0
            EO[k] = E[k] - E[15 - k];
159
0
        }
160
161
        /* EEE and EEO */
162
0
        for (k = 0; k < 4; k++)
163
0
        {
164
0
            EEE[k] = EE[k] + EE[7 - k];
165
0
            EEO[k] = EE[k] - EE[7 - k];
166
0
        }
167
168
        /* EEEE and EEEO */
169
0
        EEEE[0] = EEE[0] + EEE[3];
170
0
        EEEO[0] = EEE[0] - EEE[3];
171
0
        EEEE[1] = EEE[1] + EEE[2];
172
0
        EEEO[1] = EEE[1] - EEE[2];
173
174
0
        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
175
0
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
176
0
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
177
0
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
178
0
        for (k = 4; k < 32; k += 8)
179
0
        {
180
0
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
181
0
                                       g_t32[k][3] * EEO[3] + add) >> shift);
182
0
        }
183
184
0
        for (k = 2; k < 32; k += 4)
185
0
        {
186
0
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
187
0
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
188
0
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
189
0
        }
190
191
0
        for (k = 1; k < 32; k += 2)
192
0
        {
193
0
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
194
0
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
195
0
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
196
0
                                       O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
197
0
                                       g_t32[k][15] * O[15] + add) >> shift);
198
0
        }
199
200
0
        src += 32;
201
0
        dst++;
202
0
    }
203
0
}
204
205
static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
206
0
{
207
0
    int j, k;
208
0
    int E[4], O[4];
209
0
    int EE[2], EO[2];
210
0
    int add = 1 << (shift - 1);
211
212
0
    for (j = 0; j < line; j++)
213
0
    {
214
        /* E and O*/
215
0
        for (k = 0; k < 4; k++)
216
0
        {
217
0
            E[k] = src[k] + src[7 - k];
218
0
            O[k] = src[k] - src[7 - k];
219
0
        }
220
221
        /* EE and EO */
222
0
        EE[0] = E[0] + E[3];
223
0
        EO[0] = E[0] - E[3];
224
0
        EE[1] = E[1] + E[2];
225
0
        EO[1] = E[1] - E[2];
226
227
0
        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
228
0
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
229
0
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
230
0
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
231
232
0
        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
233
0
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
234
0
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
235
0
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
236
237
0
        src += 8;
238
0
        dst++;
239
0
    }
240
0
}
241
242
static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
243
0
{
244
0
    int j;
245
0
    int E[2], O[2];
246
0
    int add = 1 << (shift - 1);
247
248
0
    for (j = 0; j < line; j++)
249
0
    {
250
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
251
0
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
252
0
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
253
0
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
254
0
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
255
256
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
257
0
        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
258
0
        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
259
0
        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
260
0
        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
261
262
0
        src++;
263
0
        dst += 4;
264
0
    }
265
0
}
266
267
static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
268
0
{
269
0
    int j, k;
270
0
    int E[4], O[4];
271
0
    int EE[2], EO[2];
272
0
    int add = 1 << (shift - 1);
273
274
0
    for (j = 0; j < line; j++)
275
0
    {
276
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
277
0
        for (k = 0; k < 4; k++)
278
0
        {
279
0
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
280
0
        }
281
282
0
        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
283
0
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
284
0
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
285
0
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];
286
287
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
288
0
        E[0] = EE[0] + EO[0];
289
0
        E[3] = EE[0] - EO[0];
290
0
        E[1] = EE[1] + EO[1];
291
0
        E[2] = EE[1] - EO[1];
292
0
        for (k = 0; k < 4; k++)
293
0
        {
294
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
295
0
            dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
296
0
        }
297
298
0
        src++;
299
0
        dst += 8;
300
0
    }
301
0
}
302
303
static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
304
0
{
305
0
    int j, k;
306
0
    int E[8], O[8];
307
0
    int EE[4], EO[4];
308
0
    int EEE[2], EEO[2];
309
0
    int add = 1 << (shift - 1);
310
311
0
    for (j = 0; j < line; j++)
312
0
    {
313
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
314
0
        for (k = 0; k < 8; k++)
315
0
        {
316
0
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
317
0
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
318
0
        }
319
320
0
        for (k = 0; k < 4; k++)
321
0
        {
322
0
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
323
0
        }
324
325
0
        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
326
0
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
327
0
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
328
0
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];
329
330
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
331
0
        for (k = 0; k < 2; k++)
332
0
        {
333
0
            EE[k] = EEE[k] + EEO[k];
334
0
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
335
0
        }
336
337
0
        for (k = 0; k < 4; k++)
338
0
        {
339
0
            E[k] = EE[k] + EO[k];
340
0
            E[k + 4] = EE[3 - k] - EO[3 - k];
341
0
        }
342
343
0
        for (k = 0; k < 8; k++)
344
0
        {
345
0
            dst[k]   = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
346
0
            dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
347
0
        }
348
349
0
        src++;
350
0
        dst += 16;
351
0
    }
352
0
}
353
354
static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
355
0
{
356
0
    int j, k;
357
0
    int E[16], O[16];
358
0
    int EE[8], EO[8];
359
0
    int EEE[4], EEO[4];
360
0
    int EEEE[2], EEEO[2];
361
0
    int add = 1 << (shift - 1);
362
363
0
    for (j = 0; j < line; j++)
364
0
    {
365
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
366
0
        for (k = 0; k < 16; k++)
367
0
        {
368
0
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
369
0
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
370
0
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
371
0
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
372
0
        }
373
374
0
        for (k = 0; k < 8; k++)
375
0
        {
376
0
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
377
0
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
378
0
        }
379
380
0
        for (k = 0; k < 4; k++)
381
0
        {
382
0
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
383
0
        }
384
385
0
        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
386
0
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
387
0
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
388
0
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
389
390
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
391
0
        EEE[0] = EEEE[0] + EEEO[0];
392
0
        EEE[3] = EEEE[0] - EEEO[0];
393
0
        EEE[1] = EEEE[1] + EEEO[1];
394
0
        EEE[2] = EEEE[1] - EEEO[1];
395
0
        for (k = 0; k < 4; k++)
396
0
        {
397
0
            EE[k] = EEE[k] + EEO[k];
398
0
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
399
0
        }
400
401
0
        for (k = 0; k < 8; k++)
402
0
        {
403
0
            E[k] = EE[k] + EO[k];
404
0
            E[k + 8] = EE[7 - k] - EO[7 - k];
405
0
        }
406
407
0
        for (k = 0; k < 16; k++)
408
0
        {
409
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
410
0
            dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
411
0
        }
412
413
0
        src++;
414
0
        dst += 32;
415
0
    }
416
0
}
417
418
static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
419
0
{
420
0
    int j;
421
0
    int E[2], O[2];
422
0
    int add = 1 << (shift - 1);
423
424
0
    for (j = 0; j < line; j++)
425
0
    {
426
        /* E and O */
427
0
        E[0] = src[0] + src[3];
428
0
        O[0] = src[0] - src[3];
429
0
        E[1] = src[1] + src[2];
430
0
        O[1] = src[1] - src[2];
431
432
0
        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
433
0
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
434
0
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
435
0
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);
436
437
0
        src += 4;
438
0
        dst++;
439
0
    }
440
0
}
441
442
namespace X265_NS {
443
void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
444
0
{
445
0
    const int shift_1st = 1 + X265_DEPTH - 8;
446
0
    const int shift_2nd = 8;
447
448
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
449
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
450
451
0
    for (int i = 0; i < 4; i++)
452
0
    {
453
0
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
454
0
    }
455
456
0
    fastForwardDst(block, coef, shift_1st);
457
0
    fastForwardDst(coef, dst, shift_2nd);
458
0
}
459
460
void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
461
0
{
462
0
    const int shift_1st = 1 + X265_DEPTH - 8;
463
0
    const int shift_2nd = 8;
464
465
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
466
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
467
468
0
    for (int i = 0; i < 4; i++)
469
0
    {
470
0
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
471
0
    }
472
473
0
    partialButterfly4(block, coef, shift_1st, 4);
474
0
    partialButterfly4(coef, dst, shift_2nd, 4);
475
0
}
476
477
void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
478
0
{
479
0
    const int shift_1st = 2 + X265_DEPTH - 8;
480
0
    const int shift_2nd = 9;
481
482
0
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
483
0
    ALIGN_VAR_32(int16_t, block[8 * 8]);
484
485
0
    for (int i = 0; i < 8; i++)
486
0
    {
487
0
        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
488
0
    }
489
490
0
    partialButterfly8(block, coef, shift_1st, 8);
491
0
    partialButterfly8(coef, dst, shift_2nd, 8);
492
0
}
493
494
void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
495
0
{
496
0
    const int shift_1st = 3 + X265_DEPTH - 8;
497
0
    const int shift_2nd = 10;
498
499
0
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
500
0
    ALIGN_VAR_32(int16_t, block[16 * 16]);
501
502
0
    for (int i = 0; i < 16; i++)
503
0
    {
504
0
        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
505
0
    }
506
507
0
    partialButterfly16(block, coef, shift_1st, 16);
508
0
    partialButterfly16(coef, dst, shift_2nd, 16);
509
0
}
510
511
void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
512
0
{
513
0
    const int shift_1st = 4 + X265_DEPTH - 8;
514
0
    const int shift_2nd = 11;
515
516
0
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
517
0
    ALIGN_VAR_32(int16_t, block[32 * 32]);
518
519
0
    for (int i = 0; i < 32; i++)
520
0
    {
521
0
        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
522
0
    }
523
524
0
    partialButterfly32(block, coef, shift_1st, 32);
525
0
    partialButterfly32(coef, dst, shift_2nd, 32);
526
0
}
527
528
void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
529
0
{
530
0
    const int shift_1st = 7;
531
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
532
533
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
534
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
535
536
0
    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
537
0
    inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
538
539
0
    for (int i = 0; i < 4; i++)
540
0
    {
541
0
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
542
0
    }
543
0
}
544
545
void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
546
0
{
547
0
    const int shift_1st = 7;
548
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
549
550
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
551
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
552
553
0
    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
554
0
    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
555
556
0
    for (int i = 0; i < 4; i++)
557
0
    {
558
0
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
559
0
    }
560
0
}
561
562
void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
563
0
{
564
0
    const int shift_1st = 7;
565
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
566
567
0
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
568
0
    ALIGN_VAR_32(int16_t, block[8 * 8]);
569
570
0
    partialButterflyInverse8(src, coef, shift_1st, 8);
571
0
    partialButterflyInverse8(coef, block, shift_2nd, 8);
572
573
0
    for (int i = 0; i < 8; i++)
574
0
    {
575
0
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
576
0
    }
577
0
}
578
579
void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
580
0
{
581
0
    const int shift_1st = 7;
582
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
583
584
0
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
585
0
    ALIGN_VAR_32(int16_t, block[16 * 16]);
586
587
0
    partialButterflyInverse16(src, coef, shift_1st, 16);
588
0
    partialButterflyInverse16(coef, block, shift_2nd, 16);
589
590
0
    for (int i = 0; i < 16; i++)
591
0
    {
592
0
        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
593
0
    }
594
0
}
595
596
void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
597
0
{
598
0
    const int shift_1st = 7;
599
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
600
601
0
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
602
0
    ALIGN_VAR_32(int16_t, block[32 * 32]);
603
604
0
    partialButterflyInverse32(src, coef, shift_1st, 32);
605
0
    partialButterflyInverse32(coef, block, shift_2nd, 32);
606
607
0
    for (int i = 0; i < 32; i++)
608
0
    {
609
0
        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
610
0
    }
611
0
}
612
} // namespace X265_NS
613
614
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
615
0
{
616
#if HIGH_BIT_DEPTH
617
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
618
#else
619
    // NOTE: maximum of scale is (72 * 256)
620
0
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
621
0
#endif
622
0
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
623
0
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
624
0
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
625
0
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");
626
627
0
    int add, coeffQ;
628
629
0
    add = 1 << (shift - 1);
630
631
0
    for (int n = 0; n < num; n++)
632
0
    {
633
0
        coeffQ = (quantCoef[n] * scale + add) >> shift;
634
0
        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
635
0
    }
636
0
}
637
638
static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
639
0
{
640
0
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
641
642
0
    int add, coeffQ;
643
644
0
    shift += 4;
645
646
0
    if (shift > per)
647
0
    {
648
0
        add = 1 << (shift - per - 1);
649
650
0
        for (int n = 0; n < num; n++)
651
0
        {
652
0
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
653
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
654
0
        }
655
0
    }
656
0
    else
657
0
    {
658
0
        for (int n = 0; n < num; n++)
659
0
        {
660
0
            coeffQ   = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
661
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
662
0
        }
663
0
    }
664
0
}
665
666
static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
667
0
{
668
0
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
669
0
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
670
0
    int qBits8 = qBits - 8;
671
0
    uint32_t numSig = 0;
672
673
0
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
674
0
    {
675
0
        int level = coef[blockpos];
676
0
        int sign  = (level < 0 ? -1 : 1);
677
678
0
        int tmplevel = abs(level) * quantCoeff[blockpos];
679
0
        level = ((tmplevel + add) >> qBits);
680
0
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
681
0
        if (level)
682
0
            ++numSig;
683
0
        level *= sign;
684
0
        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
685
0
    }
686
687
0
    return numSig;
688
0
}
689
690
static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
691
0
{
692
0
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
693
0
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
694
0
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
695
696
0
    uint32_t numSig = 0;
697
698
0
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
699
0
    {
700
0
        int level = coef[blockpos];
701
0
        int sign  = (level < 0 ? -1 : 1);
702
703
0
        int tmplevel = abs(level) * quantCoeff[blockpos];
704
0
        level = ((tmplevel + add) >> qBits);
705
0
        if (level)
706
0
            ++numSig;
707
0
        level *= sign;
708
709
        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
710
        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
711
0
        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
712
0
    }
713
714
0
    return numSig;
715
0
}
716
template<int trSize>
717
int  count_nonzero_c(const int16_t* quantCoeff)
718
0
{
719
0
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
0
    int count = 0;
721
0
    int numCoeff = trSize * trSize;
722
0
    for (int i = 0; i < numCoeff; i++)
723
0
    {
724
0
        count += quantCoeff[i] != 0;
725
0
    }
726
727
0
    return count;
728
0
}
Unexecuted instantiation: int count_nonzero_c<4>(short const*)
Unexecuted instantiation: int count_nonzero_c<8>(short const*)
Unexecuted instantiation: int count_nonzero_c<16>(short const*)
Unexecuted instantiation: int count_nonzero_c<32>(short const*)
729
730
/* Copies a trSize x trSize block from `residual` (rows resiStride elements
 * apart) into the contiguous buffer `coeff` and returns how many of the
 * copied values are non-zero. */
template<int trSize>
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
    uint32_t nonZero = 0;

    for (int y = 0; y < trSize; y++)
    {
        const int16_t* in = residual + y * resiStride;
        int16_t* out = coeff + y * trSize;

        for (int x = 0; x < trSize; x++)
        {
            const int16_t value = in[x];
            out[x] = value;
            if (value != 0)
                nonZero++;
        }
    }

    return nonZero;
}
Unexecuted instantiation: unsigned int copy_count<4>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<8>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<16>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<32>(short*, short const*, long)
745
746
/* Shrinks each coefficient's magnitude by a per-position offset.
 *
 * dctCoef  - `numCoeff` coefficients, modified in place: the magnitude is
 *            reduced by offset[i], set to zero if that goes negative, and
 *            the original sign is restored
 * resSum   - running totals; the original magnitude of dctCoef[i] is added
 *            to resSum[i]
 * offset   - per-position amount subtracted from the magnitude
 * numCoeff - number of coefficients to process */
static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
    for (int i = 0; i < numCoeff; i++)
    {
        const int value = dctCoef[i];
        const int isNegative = (value < 0);

        int magnitude = isNegative ? -value : value;
        resSum[i] += magnitude;

        magnitude -= offset[i];
        if (magnitude < 0)
        {
            dctCoef[i] = 0;
        }
        else
        {
            dctCoef[i] = (int16_t)(isNegative ? -magnitude : magnitude);
        }
    }
}
758
759
static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
760
0
{
761
0
    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
762
0
    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
763
0
    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
764
765
0
    int scanPosLast = 0;
766
0
    do
767
0
    {
768
0
        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
769
770
0
        const uint32_t posLast = scan[scanPosLast++];
771
772
0
        const int curCoeff = coeff[posLast];
773
0
        const uint32_t isNZCoeff = (curCoeff != 0);
774
        // get L1 sig map
775
        // NOTE: the new algorithm is complicated, so I keep reference code here
776
        //uint32_t posy   = posLast >> log2TrSize;
777
        //uint32_t posx   = posLast - (posy << log2TrSize);
778
        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
779
        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
780
        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
781
0
        numSig -= isNZCoeff;
782
783
        // TODO: optimize by instruction BTS
784
0
        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
785
0
        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
786
0
        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
787
0
    }
788
0
    while (numSig > 0);
789
0
    return scanPosLast - 1;
790
0
}
791
792
// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
793
static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
794
0
{
795
0
    int n;
796
797
0
    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
798
0
    {
799
0
        const uint32_t idx = scanTbl[n];
800
0
        const uint32_t idxY = idx / MLS_CG_SIZE;
801
0
        const uint32_t idxX = idx % MLS_CG_SIZE;
802
0
        if (dstCoeff[idxY * trSize + idxX])
803
0
            break;
804
0
    }
805
806
0
    X265_CHECK(n >= -1, "non-zero coeff scan failuare!\n");
807
808
0
    uint32_t lastNZPosInCG = (uint32_t)n;
809
810
0
    for (n = 0; n < SCAN_SET_SIZE; n++)
811
0
    {
812
0
        const uint32_t idx = scanTbl[n];
813
0
        const uint32_t idxY = idx / MLS_CG_SIZE;
814
0
        const uint32_t idxX = idx % MLS_CG_SIZE;
815
0
        if (dstCoeff[idxY * trSize + idxX])
816
0
            break;
817
0
    }
818
819
0
    uint32_t firstNZPosInCG = (uint32_t)n;
820
821
0
    uint32_t absSumSign = 0;
822
0
    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
823
0
    {
824
0
        const uint32_t idx = scanTbl[n];
825
0
        const uint32_t idxY = idx / MLS_CG_SIZE;
826
0
        const uint32_t idxX = idx % MLS_CG_SIZE;
827
0
        absSumSign += dstCoeff[idxY * trSize + idxX];
828
0
    }
829
830
    // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
831
0
    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
832
0
}
833
834
835
static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
836
0
{
837
0
    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
838
0
    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
839
0
    uint32_t sum = 0;
840
841
    // correct offset to match assembly
842
0
    absCoeff -= numNonZero;
843
844
0
    for (int i = 0; i < MLS_CG_SIZE; i++)
845
0
    {
846
0
        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
847
0
        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
848
0
        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
849
0
        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
850
0
    }
851
852
0
    do
853
0
    {
854
0
        uint32_t blkPos, sig, ctxSig;
855
0
        blkPos = scan[scanPosSigOff];
856
0
        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
857
0
        sig     = scanFlagMask & 1;
858
0
        scanFlagMask >>= 1;
859
0
        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
860
0
        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
861
0
        {
862
0
            const uint32_t cnt = tabSigCtx[blkPos] + offset;
863
0
            ctxSig = cnt & posZeroMask;
864
865
            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
866
            //encodeBin(sig, baseCtx[ctxSig]);
867
0
            const uint32_t mstate = baseCtx[ctxSig];
868
0
            const uint32_t mps = mstate & 1;
869
0
            const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
870
0
            uint32_t nextState = (stateBits >> 24) + mps;
871
0
            if ((mstate ^ sig) == 1)
872
0
                nextState = sig;
873
0
            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
874
0
            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
875
0
            baseCtx[ctxSig] = (uint8_t)nextState;
876
0
            sum += stateBits;
877
0
        }
878
0
        assert(numNonZero <= 15);
879
0
        assert(blkPos <= 15);
880
0
        absCoeff[numNonZero] = tmpCoeff[blkPos];
881
0
        numNonZero += sig;
882
0
        scanPosSigOff--;
883
0
    }
884
0
    while(scanPosSigOff >= 0);
885
886
0
    return (sum & 0xFFFFFF);
887
0
}
888
889
static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
890
0
{
891
0
    uint32_t goRiceParam = 0;
892
893
0
    uint32_t sum = 0;
894
0
    int baseLevel = 3;
895
0
    do
896
0
    {
897
0
        if (idx >= C1FLAG_NUMBER)
898
0
            baseLevel = 1;
899
900
        // TODO: the IDX is not really idx, so this check inactive
901
        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
902
0
        int codeNumber = absCoeff[idx] - baseLevel;
903
904
0
        if (codeNumber >= 0)
905
0
        {
906
            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
907
0
            uint32_t length = 0;
908
909
0
            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
910
0
            if (codeNumber >= 0)
911
0
            {
912
0
                {
913
0
                    unsigned long cidx;
914
0
                    BSR(cidx, codeNumber + 1);
915
0
                    length = cidx;
916
0
                }
917
0
                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
918
919
0
                codeNumber = (length + length);
920
0
            }
921
0
            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
922
923
0
            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
924
0
                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
925
0
            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
926
0
        }
927
0
        baseLevel = 2;
928
0
        idx++;
929
0
    }
930
0
    while(idx < numNonZero);
931
932
0
    return sum;
933
0
}
934
935
936
static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
937
0
{
938
0
    uint32_t sum = 0;
939
0
    uint32_t c1 = 1;
940
0
    uint32_t firstC2Idx = 8;
941
0
    uint32_t firstC2Flag = 2;
942
0
    uint32_t c1Next = 0xFFFFFFFE;
943
944
0
    int idx = 0;
945
0
    do
946
0
    {
947
0
        uint32_t symbol1 = absCoeff[idx] > 1;
948
0
        uint32_t symbol2 = absCoeff[idx] > 2;
949
        //encodeBin(symbol1, baseCtxMod[c1]);
950
0
        {
951
0
            const uint32_t mstate = baseCtxMod[c1];
952
0
            baseCtxMod[c1] = sbacNext(mstate, symbol1);
953
0
            sum += sbacGetEntropyBits(mstate, symbol1);
954
0
        }
955
956
0
        if (symbol1)
957
0
            c1Next = 0;
958
959
0
        if (symbol1 + firstC2Flag == 3)
960
0
            firstC2Flag = symbol2;
961
962
0
        if (symbol1 + firstC2Idx == 9)
963
0
            firstC2Idx  = idx;
964
965
0
        c1 = (c1Next & 3);
966
0
        c1Next >>= 2;
967
0
        X265_CHECK(c1 <= 3, "c1 check failure\n");
968
0
        idx++;
969
0
    }
970
0
    while(idx < numC1Flag);
971
972
0
    if (!c1)
973
0
    {
974
0
        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
975
976
0
        baseCtxMod += ctxOffset;
977
978
        //encodeBin(firstC2Flag, baseCtxMod[0]);
979
0
        {
980
0
            const uint32_t mstate = baseCtxMod[0];
981
0
            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
982
0
            sum += sbacGetEntropyBits(mstate, firstC2Flag);
983
0
        }
984
0
    }
985
0
    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
986
0
}
987
template<int log2TrSize>
988
static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
989
0
{
990
0
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
0
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
0
    const uint32_t trSize = 1 << log2TrSize;
993
994
0
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
0
    {
996
0
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
0
        {
998
0
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
0
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
0
             *totalUncodedCost += costUncoded[blkPos + x];
1001
0
             *totalRdCost += costUncoded[blkPos + x];
1002
0
        }
1003
0
        blkPos += trSize;
1004
0
    }
1005
0
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<2>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<3>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<4>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<5>(short*, long*, long*, long*, unsigned int)
1006
template<int log2TrSize>
1007
static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1008
0
{
1009
0
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1010
0
    const int scaleBits = SCALE_BITS - 2 * transformShift;
1011
0
    const uint32_t trSize = 1 << log2TrSize;
1012
0
    int max = X265_MAX(0, (2 * transformShift + 1));
1013
1014
0
    for (int y = 0; y < MLS_CG_SIZE; y++)
1015
0
    {
1016
0
        for (int x = 0; x < MLS_CG_SIZE; x++)
1017
0
        {
1018
0
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1019
0
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1020
1021
0
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1022
1023
            /* when no residual coefficient is coded, predicted coef == recon coef */
1024
0
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1025
1026
0
            *totalUncodedCost += costUncoded[blkPos + x];
1027
0
            *totalRdCost += costUncoded[blkPos + x];
1028
0
        }
1029
0
        blkPos += trSize;
1030
0
    }
1031
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1032
template<int log2TrSize>
1033
static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
1034
0
{
1035
0
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
0
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
0
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
0
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
0
  {
1041
0
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
0
    {
1043
0
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
0
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
0
      *totalUncodedCost += costUncoded[blkPos + x];
1046
0
      *totalRdCost += costUncoded[blkPos + x];
1047
0
    }
1048
0
    blkPos += trSize;
1049
0
  }
1050
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<2>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<3>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<4>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<5>(short*, long*, long*, long*, unsigned int)
1051
template<int log2TrSize>
1052
static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1053
0
{
1054
0
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
0
  const uint32_t trSize = 1 << log2TrSize;
1057
0
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
0
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
0
  {
1061
0
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
0
    {
1063
0
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
0
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
0
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
0
      *totalUncodedCost += costUncoded[blkPos + x];
1067
0
      *totalRdCost += costUncoded[blkPos + x];
1068
0
    }
1069
0
    blkPos += trSize;
1070
0
  }
1071
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1072
1073
namespace X265_NS {
1074
// x265 private namespace
1075
void setupDCTPrimitives_c(EncoderPrimitives& p)
1076
0
{
1077
0
    p.dequant_scaling = dequant_scaling_c;
1078
0
    p.dequant_normal = dequant_normal_c;
1079
0
    p.quant = quant_c;
1080
0
    p.nquant = nquant_c;
1081
0
    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
1082
0
    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
1083
0
    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
1084
0
    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
1085
0
    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
1086
0
    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
1087
0
    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
1088
0
    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
1089
0
    p.dst4x4 = dst4_c;
1090
0
    p.cu[BLOCK_4x4].dct   = dct4_c;
1091
0
    p.cu[BLOCK_8x8].dct   = dct8_c;
1092
0
    p.cu[BLOCK_16x16].dct = dct16_c;
1093
0
    p.cu[BLOCK_32x32].dct = dct32_c;
1094
0
    p.idst4x4 = idst4_c;
1095
0
    p.cu[BLOCK_4x4].idct   = idct4_c;
1096
0
    p.cu[BLOCK_8x8].idct   = idct8_c;
1097
0
    p.cu[BLOCK_16x16].idct = idct16_c;
1098
0
    p.cu[BLOCK_32x32].idct = idct32_c;
1099
0
    p.denoiseDct = denoiseDct_c;
1100
0
    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
1101
0
    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
1102
0
    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
1103
0
    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
1104
1105
0
    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
1106
0
    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
1107
0
    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
1108
0
    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
1109
0
  p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
1110
0
  p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
1111
0
  p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
1112
0
  p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
1113
0
  p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
1114
0
  p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
1115
0
  p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
1116
0
  p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
1117
0
    p.scanPosLast = scanPosLast_c;
1118
0
    p.findPosFirstLast = findPosFirstLast_c;
1119
0
    p.costCoeffNxN = costCoeffNxN_c;
1120
0
    p.costCoeffRemain = costCoeffRemain_c;
1121
0
    p.costC1C2Flag = costC1C2Flag_c;
1122
0
}
1123
}