Coverage Report

Created: 2022-08-24 06:17

/src/x265/source/common/dct.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
5
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
6
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
7
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
8
 *          Min Chen <min.chen@multicorewareinc.com>
9
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10
 *          Nabajit Deka <nabajit@multicorewareinc.com>
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
25
 *
26
 * This program is also available under a commercial proprietary license.
27
 * For more information, contact us at license @ x265.com.
28
 *****************************************************************************/
29
30
#include "common.h"
31
#include "primitives.h"
32
#include "contexts.h"   // costCoeffNxN_c
33
#include "threading.h"  // CLZ
34
35
using namespace X265_NS;
36
37
#if _MSC_VER
38
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
39
#endif
40
41
// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
42
// give identical results
43
// One pass of the fast 4x4 forward DST.
//   block : 16 input samples, read four at a time (block[4 * row + 0..3])
//   coeff : 16 outputs, written transposed (coeff[row], coeff[4 + row], ...)
//   shift : right shift applied after adding the rounding term 1 << (shift - 1)
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)
{
    const int round = 1 << (shift - 1);

    for (int row = 0; row < 4; row++)
    {
        const int16_t* in = block + 4 * row;

        // Shared partial sums reused by three of the four outputs
        const int sum03  = in[0] + in[3];
        const int sum13  = in[1] + in[3];
        const int diff01 = in[0] - in[1];
        const int mid    = 74 * in[2];

        coeff[row]      = (int16_t)((29 * sum03 + 55 * sum13 + mid + round) >> shift);
        coeff[row + 4]  = (int16_t)((74 * (in[0] + in[1] - in[3]) + round) >> shift);
        coeff[row + 8]  = (int16_t)((29 * diff01 + 55 * sum03 - mid + round) >> shift);
        coeff[row + 12] = (int16_t)((55 * diff01 - 29 * sum13 + mid + round) >> shift);
    }
}
62
63
static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
64
12.4k
{
65
12.4k
    int i, c[4];
66
12.4k
    int rnd_factor = 1 << (shift - 1);
67
68
62.0k
    for (i = 0; i < 4; i++)
69
49.6k
    {
70
        // Intermediate Variables
71
49.6k
        c[0] = tmp[i] + tmp[8 + i];
72
49.6k
        c[1] = tmp[8 + i] + tmp[12 + i];
73
49.6k
        c[2] = tmp[i] - tmp[12 + i];
74
49.6k
        c[3] = 74 * tmp[4 + i];
75
76
49.6k
        block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
77
49.6k
        block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
78
49.6k
        block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
79
49.6k
        block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
80
49.6k
    }
81
12.4k
}
82
83
static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
84
887k
{
85
887k
    int j, k;
86
887k
    int E[8], O[8];
87
887k
    int EE[4], EO[4];
88
887k
    int EEE[2], EEO[2];
89
887k
    int add = 1 << (shift - 1);
90
91
15.0M
    for (j = 0; j < line; j++)
92
14.1M
    {
93
        /* E and O */
94
127M
        for (k = 0; k < 8; k++)
95
113M
        {
96
113M
            E[k] = src[k] + src[15 - k];
97
113M
            O[k] = src[k] - src[15 - k];
98
113M
        }
99
100
        /* EE and EO */
101
70.7M
        for (k = 0; k < 4; k++)
102
56.5M
        {
103
56.5M
            EE[k] = E[k] + E[7 - k];
104
56.5M
            EO[k] = E[k] - E[7 - k];
105
56.5M
        }
106
107
        /* EEE and EEO */
108
14.1M
        EEE[0] = EE[0] + EE[3];
109
14.1M
        EEO[0] = EE[0] - EE[3];
110
14.1M
        EEE[1] = EE[1] + EE[2];
111
14.1M
        EEO[1] = EE[1] - EE[2];
112
113
14.1M
        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
114
14.1M
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
115
14.1M
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
116
14.1M
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
117
118
70.5M
        for (k = 2; k < 16; k += 4)
119
56.4M
        {
120
56.4M
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
121
56.4M
                                       g_t16[k][3] * EO[3] + add) >> shift);
122
56.4M
        }
123
124
126M
        for (k = 1; k < 16; k += 2)
125
112M
        {
126
112M
            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
127
112M
                                        g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
128
112M
                                        add) >> shift);
129
112M
        }
130
131
14.1M
        src += 16;
132
14.1M
        dst++;
133
14.1M
    }
134
887k
}
135
136
static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
137
100k
{
138
100k
    int j, k;
139
100k
    int E[16], O[16];
140
100k
    int EE[8], EO[8];
141
100k
    int EEE[4], EEO[4];
142
100k
    int EEEE[2], EEEO[2];
143
100k
    int add = 1 << (shift - 1);
144
145
3.31M
    for (j = 0; j < line; j++)
146
3.21M
    {
147
        /* E and O*/
148
54.5M
        for (k = 0; k < 16; k++)
149
51.3M
        {
150
51.3M
            E[k] = src[k] + src[31 - k];
151
51.3M
            O[k] = src[k] - src[31 - k];
152
51.3M
        }
153
154
        /* EE and EO */
155
28.9M
        for (k = 0; k < 8; k++)
156
25.6M
        {
157
25.6M
            EE[k] = E[k] + E[15 - k];
158
25.6M
            EO[k] = E[k] - E[15 - k];
159
25.6M
        }
160
161
        /* EEE and EEO */
162
16.0M
        for (k = 0; k < 4; k++)
163
12.8M
        {
164
12.8M
            EEE[k] = EE[k] + EE[7 - k];
165
12.8M
            EEO[k] = EE[k] - EE[7 - k];
166
12.8M
        }
167
168
        /* EEEE and EEEO */
169
3.21M
        EEEE[0] = EEE[0] + EEE[3];
170
3.21M
        EEEO[0] = EEE[0] - EEE[3];
171
3.21M
        EEEE[1] = EEE[1] + EEE[2];
172
3.21M
        EEEO[1] = EEE[1] - EEE[2];
173
174
3.21M
        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
175
3.21M
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
176
3.21M
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
177
3.21M
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
178
15.9M
        for (k = 4; k < 32; k += 8)
179
12.7M
        {
180
12.7M
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
181
12.7M
                                       g_t32[k][3] * EEO[3] + add) >> shift);
182
12.7M
        }
183
184
28.7M
        for (k = 2; k < 32; k += 4)
185
25.5M
        {
186
25.5M
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
187
25.5M
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
188
25.5M
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
189
25.5M
        }
190
191
54.3M
        for (k = 1; k < 32; k += 2)
192
51.1M
        {
193
51.1M
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
194
51.1M
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
195
51.1M
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
196
51.1M
                                       O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
197
51.1M
                                       g_t32[k][15] * O[15] + add) >> shift);
198
51.1M
        }
199
200
3.21M
        src += 32;
201
3.21M
        dst++;
202
3.21M
    }
203
100k
}
204
205
static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
206
3.90M
{
207
3.90M
    int j, k;
208
3.90M
    int E[4], O[4];
209
3.90M
    int EE[2], EO[2];
210
3.90M
    int add = 1 << (shift - 1);
211
212
35.0M
    for (j = 0; j < line; j++)
213
31.1M
    {
214
        /* E and O*/
215
155M
        for (k = 0; k < 4; k++)
216
124M
        {
217
124M
            E[k] = src[k] + src[7 - k];
218
124M
            O[k] = src[k] - src[7 - k];
219
124M
        }
220
221
        /* EE and EO */
222
31.1M
        EE[0] = E[0] + E[3];
223
31.1M
        EO[0] = E[0] - E[3];
224
31.1M
        EE[1] = E[1] + E[2];
225
31.1M
        EO[1] = E[1] - E[2];
226
227
31.1M
        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
228
31.1M
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
229
31.1M
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
230
31.1M
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
231
232
31.1M
        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
233
31.1M
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
234
31.1M
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
235
31.1M
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
236
237
31.1M
        src += 8;
238
31.1M
        dst++;
239
31.1M
    }
240
3.90M
}
241
242
static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
243
0
{
244
0
    int j;
245
0
    int E[2], O[2];
246
0
    int add = 1 << (shift - 1);
247
248
0
    for (j = 0; j < line; j++)
249
0
    {
250
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
251
0
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
252
0
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
253
0
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
254
0
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
255
256
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
257
0
        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
258
0
        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
259
0
        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
260
0
        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
261
262
0
        src++;
263
0
        dst += 4;
264
0
    }
265
0
}
266
267
static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
268
0
{
269
0
    int j, k;
270
0
    int E[4], O[4];
271
0
    int EE[2], EO[2];
272
0
    int add = 1 << (shift - 1);
273
274
0
    for (j = 0; j < line; j++)
275
0
    {
276
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
277
0
        for (k = 0; k < 4; k++)
278
0
        {
279
0
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
280
0
        }
281
282
0
        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
283
0
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
284
0
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
285
0
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];
286
287
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
288
0
        E[0] = EE[0] + EO[0];
289
0
        E[3] = EE[0] - EO[0];
290
0
        E[1] = EE[1] + EO[1];
291
0
        E[2] = EE[1] - EO[1];
292
0
        for (k = 0; k < 4; k++)
293
0
        {
294
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
295
0
            dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
296
0
        }
297
298
0
        src++;
299
0
        dst += 8;
300
0
    }
301
0
}
302
303
static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
304
0
{
305
0
    int j, k;
306
0
    int E[8], O[8];
307
0
    int EE[4], EO[4];
308
0
    int EEE[2], EEO[2];
309
0
    int add = 1 << (shift - 1);
310
311
0
    for (j = 0; j < line; j++)
312
0
    {
313
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
314
0
        for (k = 0; k < 8; k++)
315
0
        {
316
0
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
317
0
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
318
0
        }
319
320
0
        for (k = 0; k < 4; k++)
321
0
        {
322
0
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
323
0
        }
324
325
0
        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
326
0
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
327
0
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
328
0
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];
329
330
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
331
0
        for (k = 0; k < 2; k++)
332
0
        {
333
0
            EE[k] = EEE[k] + EEO[k];
334
0
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
335
0
        }
336
337
0
        for (k = 0; k < 4; k++)
338
0
        {
339
0
            E[k] = EE[k] + EO[k];
340
0
            E[k + 4] = EE[3 - k] - EO[3 - k];
341
0
        }
342
343
0
        for (k = 0; k < 8; k++)
344
0
        {
345
0
            dst[k]   = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
346
0
            dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
347
0
        }
348
349
0
        src++;
350
0
        dst += 16;
351
0
    }
352
0
}
353
354
static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
355
0
{
356
0
    int j, k;
357
0
    int E[16], O[16];
358
0
    int EE[8], EO[8];
359
0
    int EEE[4], EEO[4];
360
0
    int EEEE[2], EEEO[2];
361
0
    int add = 1 << (shift - 1);
362
363
0
    for (j = 0; j < line; j++)
364
0
    {
365
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
366
0
        for (k = 0; k < 16; k++)
367
0
        {
368
0
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
369
0
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
370
0
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
371
0
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
372
0
        }
373
374
0
        for (k = 0; k < 8; k++)
375
0
        {
376
0
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
377
0
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
378
0
        }
379
380
0
        for (k = 0; k < 4; k++)
381
0
        {
382
0
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
383
0
        }
384
385
0
        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
386
0
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
387
0
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
388
0
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
389
390
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
391
0
        EEE[0] = EEEE[0] + EEEO[0];
392
0
        EEE[3] = EEEE[0] - EEEO[0];
393
0
        EEE[1] = EEEE[1] + EEEO[1];
394
0
        EEE[2] = EEEE[1] - EEEO[1];
395
0
        for (k = 0; k < 4; k++)
396
0
        {
397
0
            EE[k] = EEE[k] + EEO[k];
398
0
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
399
0
        }
400
401
0
        for (k = 0; k < 8; k++)
402
0
        {
403
0
            E[k] = EE[k] + EO[k];
404
0
            E[k + 8] = EE[7 - k] - EO[7 - k];
405
0
        }
406
407
0
        for (k = 0; k < 16; k++)
408
0
        {
409
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
410
0
            dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
411
0
        }
412
413
0
        src++;
414
0
        dst += 32;
415
0
    }
416
0
}
417
418
static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
419
14.8M
{
420
14.8M
    int j;
421
14.8M
    int E[2], O[2];
422
14.8M
    int add = 1 << (shift - 1);
423
424
74.1M
    for (j = 0; j < line; j++)
425
59.3M
    {
426
        /* E and O */
427
59.3M
        E[0] = src[0] + src[3];
428
59.3M
        O[0] = src[0] - src[3];
429
59.3M
        E[1] = src[1] + src[2];
430
59.3M
        O[1] = src[1] - src[2];
431
432
59.3M
        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
433
59.3M
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
434
59.3M
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
435
59.3M
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);
436
437
59.3M
        src += 4;
438
59.3M
        dst++;
439
59.3M
    }
440
14.8M
}
441
442
static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
443
2.80M
{
444
2.80M
    const int shift_1st = 1 + X265_DEPTH - 8;
445
2.80M
    const int shift_2nd = 8;
446
447
2.80M
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
448
2.80M
    ALIGN_VAR_32(int16_t, block[4 * 4]);
449
450
14.0M
    for (int i = 0; i < 4; i++)
451
11.2M
    {
452
11.2M
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
453
11.2M
    }
454
455
2.80M
    fastForwardDst(block, coef, shift_1st);
456
2.80M
    fastForwardDst(coef, dst, shift_2nd);
457
2.80M
}
458
459
static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
460
7.42M
{
461
7.42M
    const int shift_1st = 1 + X265_DEPTH - 8;
462
7.42M
    const int shift_2nd = 8;
463
464
7.42M
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
465
7.42M
    ALIGN_VAR_32(int16_t, block[4 * 4]);
466
467
37.1M
    for (int i = 0; i < 4; i++)
468
29.7M
    {
469
29.7M
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
470
29.7M
    }
471
472
7.42M
    partialButterfly4(block, coef, shift_1st, 4);
473
7.42M
    partialButterfly4(coef, dst, shift_2nd, 4);
474
7.42M
}
475
476
static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
477
1.95M
{
478
1.95M
    const int shift_1st = 2 + X265_DEPTH - 8;
479
1.95M
    const int shift_2nd = 9;
480
481
1.95M
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
482
1.95M
    ALIGN_VAR_32(int16_t, block[8 * 8]);
483
484
17.5M
    for (int i = 0; i < 8; i++)
485
15.6M
    {
486
15.6M
        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
487
15.6M
    }
488
489
1.95M
    partialButterfly8(block, coef, shift_1st, 8);
490
1.95M
    partialButterfly8(coef, dst, shift_2nd, 8);
491
1.95M
}
492
493
static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
494
443k
{
495
443k
    const int shift_1st = 3 + X265_DEPTH - 8;
496
443k
    const int shift_2nd = 10;
497
498
443k
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
499
443k
    ALIGN_VAR_32(int16_t, block[16 * 16]);
500
501
7.54M
    for (int i = 0; i < 16; i++)
502
7.10M
    {
503
7.10M
        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
504
7.10M
    }
505
506
443k
    partialButterfly16(block, coef, shift_1st, 16);
507
443k
    partialButterfly16(coef, dst, shift_2nd, 16);
508
443k
}
509
510
static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
511
50.3k
{
512
50.3k
    const int shift_1st = 4 + X265_DEPTH - 8;
513
50.3k
    const int shift_2nd = 11;
514
515
50.3k
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
516
50.3k
    ALIGN_VAR_32(int16_t, block[32 * 32]);
517
518
1.66M
    for (int i = 0; i < 32; i++)
519
1.61M
    {
520
1.61M
        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
521
1.61M
    }
522
523
50.3k
    partialButterfly32(block, coef, shift_1st, 32);
524
50.3k
    partialButterfly32(coef, dst, shift_2nd, 32);
525
50.3k
}
526
527
static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
528
6.20k
{
529
6.20k
    const int shift_1st = 7;
530
6.20k
    const int shift_2nd = 12 - (X265_DEPTH - 8);
531
532
6.20k
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
533
6.20k
    ALIGN_VAR_32(int16_t, block[4 * 4]);
534
535
6.20k
    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
536
6.20k
    inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
537
538
31.0k
    for (int i = 0; i < 4; i++)
539
24.8k
    {
540
24.8k
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
541
24.8k
    }
542
6.20k
}
543
544
static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
545
0
{
546
0
    const int shift_1st = 7;
547
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
548
549
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
550
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
551
552
0
    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
553
0
    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
554
555
0
    for (int i = 0; i < 4; i++)
556
0
    {
557
0
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
558
0
    }
559
0
}
560
561
static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
562
0
{
563
0
    const int shift_1st = 7;
564
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
565
566
0
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
567
0
    ALIGN_VAR_32(int16_t, block[8 * 8]);
568
569
0
    partialButterflyInverse8(src, coef, shift_1st, 8);
570
0
    partialButterflyInverse8(coef, block, shift_2nd, 8);
571
572
0
    for (int i = 0; i < 8; i++)
573
0
    {
574
0
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
575
0
    }
576
0
}
577
578
static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
579
0
{
580
0
    const int shift_1st = 7;
581
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
582
583
0
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
584
0
    ALIGN_VAR_32(int16_t, block[16 * 16]);
585
586
0
    partialButterflyInverse16(src, coef, shift_1st, 16);
587
0
    partialButterflyInverse16(coef, block, shift_2nd, 16);
588
589
0
    for (int i = 0; i < 16; i++)
590
0
    {
591
0
        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
592
0
    }
593
0
}
594
595
static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
596
0
{
597
0
    const int shift_1st = 7;
598
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
599
600
0
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
601
0
    ALIGN_VAR_32(int16_t, block[32 * 32]);
602
603
0
    partialButterflyInverse32(src, coef, shift_1st, 32);
604
0
    partialButterflyInverse32(coef, block, shift_2nd, 32);
605
606
0
    for (int i = 0; i < 32; i++)
607
0
    {
608
0
        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
609
0
    }
610
0
}
611
612
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
613
39.2k
{
614
#if HIGH_BIT_DEPTH
615
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
616
#else
617
    // NOTE: maximum of scale is (72 * 256)
618
39.2k
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
619
39.2k
#endif
620
39.2k
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
621
39.2k
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
622
39.2k
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
623
39.2k
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");
624
625
39.2k
    int add, coeffQ;
626
627
39.2k
    add = 1 << (shift - 1);
628
629
6.15M
    for (int n = 0; n < num; n++)
630
6.11M
    {
631
6.11M
        coeffQ = (quantCoef[n] * scale + add) >> shift;
632
6.11M
        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
633
6.11M
    }
634
39.2k
}
635
636
static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
637
0
{
638
0
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
639
640
0
    int add, coeffQ;
641
642
0
    shift += 4;
643
644
0
    if (shift > per)
645
0
    {
646
0
        add = 1 << (shift - per - 1);
647
648
0
        for (int n = 0; n < num; n++)
649
0
        {
650
0
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
651
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
652
0
        }
653
0
    }
654
0
    else
655
0
    {
656
0
        for (int n = 0; n < num; n++)
657
0
        {
658
0
            coeffQ   = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
659
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
660
0
        }
661
0
    }
662
0
}
663
664
static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
665
0
{
666
0
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
667
0
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
668
0
    int qBits8 = qBits - 8;
669
0
    uint32_t numSig = 0;
670
671
0
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
672
0
    {
673
0
        int level = coef[blockpos];
674
0
        int sign  = (level < 0 ? -1 : 1);
675
676
0
        int tmplevel = abs(level) * quantCoeff[blockpos];
677
0
        level = ((tmplevel + add) >> qBits);
678
0
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
679
0
        if (level)
680
0
            ++numSig;
681
0
        level *= sign;
682
0
        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
683
0
    }
684
685
0
    return numSig;
686
0
}
687
688
static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
689
8.99M
{
690
8.99M
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
691
8.99M
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
692
8.99M
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
693
694
8.99M
    uint32_t numSig = 0;
695
696
301M
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
697
292M
    {
698
292M
        int level = coef[blockpos];
699
292M
        int sign  = (level < 0 ? -1 : 1);
700
701
292M
        int tmplevel = abs(level) * quantCoeff[blockpos];
702
292M
        level = ((tmplevel + add) >> qBits);
703
292M
        if (level)
704
112k
            ++numSig;
705
292M
        level *= sign;
706
707
        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
708
        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
709
292M
        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
710
292M
    }
711
712
8.99M
    return numSig;
713
8.99M
}
714
template<int trSize>
715
int  count_nonzero_c(const int16_t* quantCoeff)
716
66.8k
{
717
66.8k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718
66.8k
    int count = 0;
719
66.8k
    int numCoeff = trSize * trSize;
720
10.6M
    for (int i = 0; i < numCoeff; i++)
721
10.5M
    {
722
10.5M
        count += quantCoeff[i] != 0;
723
10.5M
    }
724
725
66.8k
    return count;
726
66.8k
}
int count_nonzero_c<4>(short const*)
Line
Count
Source
716
29.7k
{
717
29.7k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718
29.7k
    int count = 0;
719
29.7k
    int numCoeff = trSize * trSize;
720
506k
    for (int i = 0; i < numCoeff; i++)
721
476k
    {
722
476k
        count += quantCoeff[i] != 0;
723
476k
    }
724
725
29.7k
    return count;
726
29.7k
}
int count_nonzero_c<8>(short const*)
Line
Count
Source
716
16.8k
{
717
16.8k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718
16.8k
    int count = 0;
719
16.8k
    int numCoeff = trSize * trSize;
720
1.09M
    for (int i = 0; i < numCoeff; i++)
721
1.07M
    {
722
1.07M
        count += quantCoeff[i] != 0;
723
1.07M
    }
724
725
16.8k
    return count;
726
16.8k
}
int count_nonzero_c<16>(short const*)
Line
Count
Source
716
15.3k
{
717
15.3k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718
15.3k
    int count = 0;
719
15.3k
    int numCoeff = trSize * trSize;
720
3.94M
    for (int i = 0; i < numCoeff; i++)
721
3.92M
    {
722
3.92M
        count += quantCoeff[i] != 0;
723
3.92M
    }
724
725
15.3k
    return count;
726
15.3k
}
int count_nonzero_c<32>(short const*)
Line
Count
Source
716
4.96k
{
717
4.96k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718
4.96k
    int count = 0;
719
4.96k
    int numCoeff = trSize * trSize;
720
5.08M
    for (int i = 0; i < numCoeff; i++)
721
5.08M
    {
722
5.08M
        count += quantCoeff[i] != 0;
723
5.08M
    }
724
725
4.96k
    return count;
726
4.96k
}
727
728
/* Copy a trSize x trSize block from a strided residual buffer into the
 * densely packed coeff buffer and count the non-zero samples.
 *
 *   coeff      destination, rows are trSize elements apart
 *   residual   source, rows are resiStride elements apart
 *   resiStride distance (in elements) between consecutive source rows
 *
 * Returns the number of copied samples whose value is not zero. */
template<int trSize>
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
    uint32_t nonZeroTotal = 0;

    for (int row = 0; row < trSize; row++)
    {
        const int16_t* srcRow = residual + row * resiStride;
        int16_t* dstRow = coeff + row * trSize;

        for (int col = 0; col < trSize; col++)
        {
            dstRow[col] = srcRow[col];
            if (srcRow[col] != 0)
                nonZeroTotal++;
        }
    }

    return nonZeroTotal;
}
unsigned int copy_count<4>(short*, short const*, long)
Line
Count
Source
730
3.34M
{
731
3.34M
    uint32_t numSig = 0;
732
16.7M
    for (int k = 0; k < trSize; k++)
733
13.3M
    {
734
66.9M
        for (int j = 0; j < trSize; j++)
735
53.5M
        {
736
53.5M
            coeff[k * trSize + j] = residual[k * resiStride + j];
737
53.5M
            numSig += (residual[k * resiStride + j] != 0);
738
53.5M
        }
739
13.3M
    }
740
741
3.34M
    return numSig;
742
3.34M
}
unsigned int copy_count<8>(short*, short const*, long)
Line
Count
Source
730
568k
{
731
568k
    uint32_t numSig = 0;
732
5.11M
    for (int k = 0; k < trSize; k++)
733
4.54M
    {
734
40.9M
        for (int j = 0; j < trSize; j++)
735
36.3M
        {
736
36.3M
            coeff[k * trSize + j] = residual[k * resiStride + j];
737
36.3M
            numSig += (residual[k * resiStride + j] != 0);
738
36.3M
        }
739
4.54M
    }
740
741
568k
    return numSig;
742
568k
}
unsigned int copy_count<16>(short*, short const*, long)
Line
Count
Source
730
129k
{
731
129k
    uint32_t numSig = 0;
732
2.19M
    for (int k = 0; k < trSize; k++)
733
2.06M
    {
734
35.1M
        for (int j = 0; j < trSize; j++)
735
33.0M
        {
736
33.0M
            coeff[k * trSize + j] = residual[k * resiStride + j];
737
33.0M
            numSig += (residual[k * resiStride + j] != 0);
738
33.0M
        }
739
2.06M
    }
740
741
129k
    return numSig;
742
129k
}
unsigned int copy_count<32>(short*, short const*, long)
Line
Count
Source
730
12.2k
{
731
12.2k
    uint32_t numSig = 0;
732
403k
    for (int k = 0; k < trSize; k++)
733
391k
    {
734
12.9M
        for (int j = 0; j < trSize; j++)
735
12.5M
        {
736
12.5M
            coeff[k * trSize + j] = residual[k * resiStride + j];
737
12.5M
            numSig += (residual[k * resiStride + j] != 0);
738
12.5M
        }
739
391k
    }
740
741
12.2k
    return numSig;
742
12.2k
}
743
744
/* For each of the numCoeff coefficients: add its absolute value to resSum[i],
 * then shrink its magnitude by offset[i]. A magnitude that would drop below
 * zero becomes 0; otherwise the original sign is restored on the result. */
static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
    for (int pos = 0; pos < numCoeff; pos++)
    {
        const int coef = dctCoef[pos];
        const int isNegative = coef < 0;
        const int magnitude = isNegative ? -coef : coef;

        resSum[pos] += magnitude;

        const int reduced = magnitude - offset[pos];
        int result = 0;
        if (reduced >= 0)
            result = isNegative ? -reduced : reduced;

        dctCoef[pos] = (int16_t)result;
    }
}
756
757
static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
758
117k
{
759
117k
    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
760
117k
    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
761
117k
    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
762
763
117k
    int scanPosLast = 0;
764
117k
    do
765
2.39M
    {
766
2.39M
        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
767
768
2.39M
        const uint32_t posLast = scan[scanPosLast++];
769
770
2.39M
        const int curCoeff = coeff[posLast];
771
2.39M
        const uint32_t isNZCoeff = (curCoeff != 0);
772
        // get L1 sig map
773
        // NOTE: the new algorithm is complicated, so I keep reference code here
774
        //uint32_t posy   = posLast >> log2TrSize;
775
        //uint32_t posx   = posLast - (posy << log2TrSize);
776
        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
777
        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
778
        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
779
2.39M
        numSig -= isNZCoeff;
780
781
        // TODO: optimize by instruction BTS
782
2.39M
        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
783
2.39M
        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
784
2.39M
        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
785
2.39M
    }
786
2.39M
    while (numSig > 0);
787
117k
    return scanPosLast - 1;
788
117k
}
789
790
// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
791
static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
792
4.99k
{
793
4.99k
    int n;
794
795
18.7k
    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
796
18.7k
    {
797
18.7k
        const uint32_t idx = scanTbl[n];
798
18.7k
        const uint32_t idxY = idx / MLS_CG_SIZE;
799
18.7k
        const uint32_t idxX = idx % MLS_CG_SIZE;
800
18.7k
        if (dstCoeff[idxY * trSize + idxX])
801
4.99k
            break;
802
18.7k
    }
803
804
4.99k
    X265_CHECK(n >= -1, "non-zero coeff scan failuare!\n");
805
806
4.99k
    uint32_t lastNZPosInCG = (uint32_t)n;
807
808
4.99k
    for (n = 0; n < SCAN_SET_SIZE; n++)
809
4.99k
    {
810
4.99k
        const uint32_t idx = scanTbl[n];
811
4.99k
        const uint32_t idxY = idx / MLS_CG_SIZE;
812
4.99k
        const uint32_t idxX = idx % MLS_CG_SIZE;
813
4.99k
        if (dstCoeff[idxY * trSize + idxX])
814
4.99k
            break;
815
4.99k
    }
816
817
4.99k
    uint32_t firstNZPosInCG = (uint32_t)n;
818
819
4.99k
    uint32_t absSumSign = 0;
820
71.0k
    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
821
66.1k
    {
822
66.1k
        const uint32_t idx = scanTbl[n];
823
66.1k
        const uint32_t idxY = idx / MLS_CG_SIZE;
824
66.1k
        const uint32_t idxX = idx % MLS_CG_SIZE;
825
66.1k
        absSumSign += dstCoeff[idxY * trSize + idxX];
826
66.1k
    }
827
828
    // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
829
4.99k
    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
830
4.99k
}
831
832
833
static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
834
140k
{
835
140k
    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
836
140k
    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
837
140k
    uint32_t sum = 0;
838
839
    // correct offset to match assembly
840
140k
    absCoeff -= numNonZero;
841
842
701k
    for (int i = 0; i < MLS_CG_SIZE; i++)
843
561k
    {
844
561k
        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
845
561k
        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
846
561k
        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
847
561k
        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
848
561k
    }
849
850
140k
    do
851
2.20M
    {
852
2.20M
        uint32_t blkPos, sig, ctxSig;
853
2.20M
        blkPos = scan[scanPosSigOff];
854
2.20M
        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
855
2.20M
        sig     = scanFlagMask & 1;
856
2.20M
        scanFlagMask >>= 1;
857
2.20M
        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
858
2.20M
        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
859
2.20M
        {
860
2.20M
            const uint32_t cnt = tabSigCtx[blkPos] + offset;
861
2.20M
            ctxSig = cnt & posZeroMask;
862
863
            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
864
            //encodeBin(sig, baseCtx[ctxSig]);
865
2.20M
            const uint32_t mstate = baseCtx[ctxSig];
866
2.20M
            const uint32_t mps = mstate & 1;
867
2.20M
            const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
868
2.20M
            uint32_t nextState = (stateBits >> 24) + mps;
869
2.20M
            if ((mstate ^ sig) == 1)
870
16.8k
                nextState = sig;
871
2.20M
            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
872
2.20M
            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
873
2.20M
            baseCtx[ctxSig] = (uint8_t)nextState;
874
2.20M
            sum += stateBits;
875
2.20M
        }
876
2.20M
        assert(numNonZero <= 15);
877
2.20M
        assert(blkPos <= 15);
878
2.20M
        absCoeff[numNonZero] = tmpCoeff[blkPos];
879
2.20M
        numNonZero += sig;
880
2.20M
        scanPosSigOff--;
881
2.20M
    }
882
2.20M
    while(scanPosSigOff >= 0);
883
884
140k
    return (sum & 0xFFFFFF);
885
140k
}
886
887
static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
888
176k
{
889
176k
    uint32_t goRiceParam = 0;
890
891
176k
    uint32_t sum = 0;
892
176k
    int baseLevel = 3;
893
176k
    do
894
2.25M
    {
895
2.25M
        if (idx >= C1FLAG_NUMBER)
896
1.10M
            baseLevel = 1;
897
898
        // TODO: the IDX is not really idx, so this check inactive
899
        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
900
2.25M
        int codeNumber = absCoeff[idx] - baseLevel;
901
902
2.25M
        if (codeNumber >= 0)
903
2.25M
        {
904
            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
905
2.25M
            uint32_t length = 0;
906
907
2.25M
            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
908
2.25M
            if (codeNumber >= 0)
909
2.21M
            {
910
2.21M
                {
911
2.21M
                    unsigned long cidx;
912
2.21M
                    CLZ(cidx, codeNumber + 1);
913
2.21M
                    length = cidx;
914
2.21M
                }
915
2.21M
                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
916
917
2.21M
                codeNumber = (length + length);
918
2.21M
            }
919
2.25M
            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
920
921
2.25M
            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
922
2.22M
                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
923
2.25M
            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
924
2.25M
        }
925
2.25M
        baseLevel = 2;
926
2.25M
        idx++;
927
2.25M
    }
928
2.25M
    while(idx < numNonZero);
929
930
176k
    return sum;
931
176k
}
932
933
934
static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
935
182k
{
936
182k
    uint32_t sum = 0;
937
182k
    uint32_t c1 = 1;
938
182k
    uint32_t firstC2Idx = 8;
939
182k
    uint32_t firstC2Flag = 2;
940
182k
    uint32_t c1Next = 0xFFFFFFFE;
941
942
182k
    int idx = 0;
943
182k
    do
944
1.16M
    {
945
1.16M
        uint32_t symbol1 = absCoeff[idx] > 1;
946
1.16M
        uint32_t symbol2 = absCoeff[idx] > 2;
947
        //encodeBin(symbol1, baseCtxMod[c1]);
948
1.16M
        {
949
1.16M
            const uint32_t mstate = baseCtxMod[c1];
950
1.16M
            baseCtxMod[c1] = sbacNext(mstate, symbol1);
951
1.16M
            sum += sbacGetEntropyBits(mstate, symbol1);
952
1.16M
        }
953
954
1.16M
        if (symbol1)
955
1.14M
            c1Next = 0;
956
957
1.16M
        if (symbol1 + firstC2Flag == 3)
958
176k
            firstC2Flag = symbol2;
959
960
1.16M
        if (symbol1 + firstC2Idx == 9)
961
176k
            firstC2Idx  = idx;
962
963
1.16M
        c1 = (c1Next & 3);
964
1.16M
        c1Next >>= 2;
965
1.16M
        X265_CHECK(c1 <= 3, "c1 check failure\n");
966
1.16M
        idx++;
967
1.16M
    }
968
1.16M
    while(idx < numC1Flag);
969
970
182k
    if (!c1)
971
176k
    {
972
176k
        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
973
974
176k
        baseCtxMod += ctxOffset;
975
976
        //encodeBin(firstC2Flag, baseCtxMod[0]);
977
176k
        {
978
176k
            const uint32_t mstate = baseCtxMod[0];
979
176k
            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
980
176k
            sum += sbacGetEntropyBits(mstate, firstC2Flag);
981
176k
        }
982
176k
    }
983
182k
    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
984
182k
}
985
template<int log2TrSize>
986
static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
987
106k
{
988
106k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
989
106k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
990
106k
    const uint32_t trSize = 1 << log2TrSize;
991
992
531k
    for (int y = 0; y < MLS_CG_SIZE; y++)
993
424k
    {
994
2.12M
        for (int x = 0; x < MLS_CG_SIZE; x++)
995
1.69M
        {
996
1.69M
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
997
1.69M
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
998
1.69M
             *totalUncodedCost += costUncoded[blkPos + x];
999
1.69M
             *totalRdCost += costUncoded[blkPos + x];
1000
1.69M
        }
1001
424k
        blkPos += trSize;
1002
424k
    }
1003
106k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void nonPsyRdoQuant_c<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
987
45.1k
{
988
45.1k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
989
45.1k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
990
45.1k
    const uint32_t trSize = 1 << log2TrSize;
991
992
225k
    for (int y = 0; y < MLS_CG_SIZE; y++)
993
180k
    {
994
902k
        for (int x = 0; x < MLS_CG_SIZE; x++)
995
722k
        {
996
722k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
997
722k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
998
722k
             *totalUncodedCost += costUncoded[blkPos + x];
999
722k
             *totalRdCost += costUncoded[blkPos + x];
1000
722k
        }
1001
180k
        blkPos += trSize;
1002
180k
    }
1003
45.1k
}
dct.cpp:void nonPsyRdoQuant_c<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
987
61.0k
{
988
61.0k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
989
61.0k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
990
61.0k
    const uint32_t trSize = 1 << log2TrSize;
991
992
305k
    for (int y = 0; y < MLS_CG_SIZE; y++)
993
244k
    {
994
1.22M
        for (int x = 0; x < MLS_CG_SIZE; x++)
995
976k
        {
996
976k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
997
976k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
998
976k
             *totalUncodedCost += costUncoded[blkPos + x];
999
976k
             *totalRdCost += costUncoded[blkPos + x];
1000
976k
        }
1001
244k
        blkPos += trSize;
1002
244k
    }
1003
61.0k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<5>(short*, long*, long*, long*, unsigned int)
1004
template<int log2TrSize>
1005
static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1006
0
{
1007
0
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1008
0
    const int scaleBits = SCALE_BITS - 2 * transformShift;
1009
0
    const uint32_t trSize = 1 << log2TrSize;
1010
0
    int max = X265_MAX(0, (2 * transformShift + 1));
1011
1012
0
    for (int y = 0; y < MLS_CG_SIZE; y++)
1013
0
    {
1014
0
        for (int x = 0; x < MLS_CG_SIZE; x++)
1015
0
        {
1016
0
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1017
0
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1018
1019
0
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1020
1021
            /* when no residual coefficient is coded, predicted coef == recon coef */
1022
0
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1023
1024
0
            *totalUncodedCost += costUncoded[blkPos + x];
1025
0
            *totalRdCost += costUncoded[blkPos + x];
1026
0
        }
1027
0
        blkPos += trSize;
1028
0
    }
1029
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1030
template<int log2TrSize>
1031
static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
1032
276k
{
1033
276k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1034
276k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1035
276k
  const uint32_t trSize = 1 << log2TrSize;
1036
1037
1.38M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1038
1.10M
  {
1039
5.53M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1040
4.43M
    {
1041
4.43M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1042
4.43M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1043
4.43M
      *totalUncodedCost += costUncoded[blkPos + x];
1044
4.43M
      *totalRdCost += costUncoded[blkPos + x];
1045
4.43M
    }
1046
1.10M
    blkPos += trSize;
1047
1.10M
  }
1048
276k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_1<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1032
15.3k
{
1033
15.3k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1034
15.3k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1035
15.3k
  const uint32_t trSize = 1 << log2TrSize;
1036
1037
76.8k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1038
61.4k
  {
1039
307k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1040
245k
    {
1041
245k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1042
245k
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1043
245k
      *totalUncodedCost += costUncoded[blkPos + x];
1044
245k
      *totalRdCost += costUncoded[blkPos + x];
1045
245k
    }
1046
61.4k
    blkPos += trSize;
1047
61.4k
  }
1048
15.3k
}
dct.cpp:void psyRdoQuant_c_1<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1032
97.0k
{
1033
97.0k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1034
97.0k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1035
97.0k
  const uint32_t trSize = 1 << log2TrSize;
1036
1037
485k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1038
388k
  {
1039
1.94M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1040
1.55M
    {
1041
1.55M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1042
1.55M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1043
1.55M
      *totalUncodedCost += costUncoded[blkPos + x];
1044
1.55M
      *totalRdCost += costUncoded[blkPos + x];
1045
1.55M
    }
1046
388k
    blkPos += trSize;
1047
388k
  }
1048
97.0k
}
dct.cpp:void psyRdoQuant_c_1<5>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1032
164k
{
1033
164k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1034
164k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1035
164k
  const uint32_t trSize = 1 << log2TrSize;
1036
1037
822k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1038
657k
  {
1039
3.28M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1040
2.63M
    {
1041
2.63M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1042
2.63M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1043
2.63M
      *totalUncodedCost += costUncoded[blkPos + x];
1044
2.63M
      *totalRdCost += costUncoded[blkPos + x];
1045
2.63M
    }
1046
657k
    blkPos += trSize;
1047
657k
  }
1048
164k
}
1049
template<int log2TrSize>
1050
static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1051
276k
{
1052
276k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1053
1054
276k
  const uint32_t trSize = 1 << log2TrSize;
1055
276k
  int max = X265_MAX(0, (2 * transformShift + 1));
1056
1057
1.38M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1058
1.10M
  {
1059
5.53M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1060
4.43M
    {
1061
4.43M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1062
4.43M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1063
4.43M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1064
4.43M
      *totalUncodedCost += costUncoded[blkPos + x];
1065
4.43M
      *totalRdCost += costUncoded[blkPos + x];
1066
4.43M
    }
1067
1.10M
    blkPos += trSize;
1068
1.10M
  }
1069
276k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<2>(short*, short*, long*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_2<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1051
15.3k
{
1052
15.3k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1053
1054
15.3k
  const uint32_t trSize = 1 << log2TrSize;
1055
15.3k
  int max = X265_MAX(0, (2 * transformShift + 1));
1056
1057
76.8k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1058
61.4k
  {
1059
307k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1060
245k
    {
1061
245k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1062
245k
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1063
245k
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1064
245k
      *totalUncodedCost += costUncoded[blkPos + x];
1065
245k
      *totalRdCost += costUncoded[blkPos + x];
1066
245k
    }
1067
61.4k
    blkPos += trSize;
1068
61.4k
  }
1069
15.3k
}
dct.cpp:void psyRdoQuant_c_2<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1051
97.0k
{
1052
97.0k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1053
1054
97.0k
  const uint32_t trSize = 1 << log2TrSize;
1055
97.0k
  int max = X265_MAX(0, (2 * transformShift + 1));
1056
1057
485k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1058
388k
  {
1059
1.94M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1060
1.55M
    {
1061
1.55M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1062
1.55M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1063
1.55M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1064
1.55M
      *totalUncodedCost += costUncoded[blkPos + x];
1065
1.55M
      *totalRdCost += costUncoded[blkPos + x];
1066
1.55M
    }
1067
388k
    blkPos += trSize;
1068
388k
  }
1069
97.0k
}
dct.cpp:void psyRdoQuant_c_2<5>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1051
164k
{
1052
164k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1053
1054
164k
  const uint32_t trSize = 1 << log2TrSize;
1055
164k
  int max = X265_MAX(0, (2 * transformShift + 1));
1056
1057
822k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1058
657k
  {
1059
3.28M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1060
2.63M
    {
1061
2.63M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1062
2.63M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1063
2.63M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1064
2.63M
      *totalUncodedCost += costUncoded[blkPos + x];
1065
2.63M
      *totalRdCost += costUncoded[blkPos + x];
1066
2.63M
    }
1067
657k
    blkPos += trSize;
1068
657k
  }
1069
164k
}
1070
1071
namespace X265_NS {
1072
// x265 private namespace
1073
void setupDCTPrimitives_c(EncoderPrimitives& p)
1074
1
{
1075
1
    p.dequant_scaling = dequant_scaling_c;
1076
1
    p.dequant_normal = dequant_normal_c;
1077
1
    p.quant = quant_c;
1078
1
    p.nquant = nquant_c;
1079
1
    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
1080
1
    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
1081
1
    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
1082
1
    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
1083
1
    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
1084
1
    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
1085
1
    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
1086
1
    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
1087
1
    p.dst4x4 = dst4_c;
1088
1
    p.cu[BLOCK_4x4].dct   = dct4_c;
1089
1
    p.cu[BLOCK_8x8].dct   = dct8_c;
1090
1
    p.cu[BLOCK_16x16].dct = dct16_c;
1091
1
    p.cu[BLOCK_32x32].dct = dct32_c;
1092
1
    p.idst4x4 = idst4_c;
1093
1
    p.cu[BLOCK_4x4].idct   = idct4_c;
1094
1
    p.cu[BLOCK_8x8].idct   = idct8_c;
1095
1
    p.cu[BLOCK_16x16].idct = idct16_c;
1096
1
    p.cu[BLOCK_32x32].idct = idct32_c;
1097
1
    p.denoiseDct = denoiseDct_c;
1098
1
    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
1099
1
    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
1100
1
    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
1101
1
    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
1102
1103
1
    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
1104
1
    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
1105
1
    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
1106
1
    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
1107
1
  p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
1108
1
  p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
1109
1
  p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
1110
1
  p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
1111
1
  p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
1112
1
  p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
1113
1
  p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
1114
1
  p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
1115
1
    p.scanPosLast = scanPosLast_c;
1116
1
    p.findPosFirstLast = findPosFirstLast_c;
1117
1
    p.costCoeffNxN = costCoeffNxN_c;
1118
1
    p.costCoeffRemain = costCoeffRemain_c;
1119
1
    p.costC1C2Flag = costC1C2Flag_c;
1120
1
}
1121
}