Coverage Report

Created: 2026-02-26 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/common/dct.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
5
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
6
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
7
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
8
 *          Min Chen <min.chen@multicorewareinc.com>
9
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10
 *          Nabajit Deka <nabajit@multicorewareinc.com>
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
25
 *
26
 * This program is also available under a commercial proprietary license.
27
 * For more information, contact us at license @ x265.com.
28
 *****************************************************************************/
29
30
#include "common.h"
31
#include "primitives.h"
32
#include "contexts.h"   // costCoeffNxN_c
33
#include "threading.h"  // BSR
34
35
using namespace X265_NS;
36
37
#if _MSC_VER
38
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
39
#endif
40
41
// Fast forward 4x4 DST-VII. A full matrix multiplication and this fast
// factorization produce bit-identical results.
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
{
    const int round = 1 << (shift - 1);  // rounding offset for the down-shift

    for (int row = 0; row < 4; row++)
    {
        const int16_t* s = block + 4 * row;

        // Shared sums/differences of the input row
        const int t0 = s[0] + s[3];
        const int t1 = s[1] + s[3];
        const int t2 = s[0] - s[1];
        const int t3 = 74 * s[2];

        coeff[row]      = (int16_t)((29 * t0 + 55 * t1 + t3 + round) >> shift);
        coeff[4 + row]  = (int16_t)((74 * (s[0] + s[1] - s[3]) + round) >> shift);
        coeff[8 + row]  = (int16_t)((29 * t2 + 55 * t0 - t3 + round) >> shift);
        coeff[12 + row] = (int16_t)((55 * t2 - 29 * t1 + t3 + round) >> shift);
    }
}
62
63
// Inverse 4x4 DST-VII (fast algorithm); matches a full matrix multiply.
// tmp: input coefficients, block: output residual, shift: rounding down-shift.
static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
{
    int i, c[4];
    int rnd_factor = 1 << (shift - 1);  // rounding offset

    for (i = 0; i < 4; i++)
    {
        // Intermediate Variables
        c[0] = tmp[i] + tmp[8 + i];
        c[1] = tmp[8 + i] + tmp[12 + i];
        c[2] = tmp[i] - tmp[12 + i];
        c[3] = 74 * tmp[4 + i];

        // Each output sample is clipped to the int16_t range
        block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
        block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
        block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
        block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
    }
}
82
83
// Forward 16-point partial-butterfly DCT pass (HEVC reference style).
// src: 'line' rows of 16 samples; dst: transposed output, coefficient k of
// row j is stored at dst[k * line + j]; shift: rounding down-shift.
static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 8; k++)
        {
            E[k] = src[k] + src[15 - k];
            O[k] = src[k] - src[15 - k];
        }

        /* EE and EO */
        for (k = 0; k < 4; k++)
        {
            EE[k] = E[k] + E[7 - k];
            EO[k] = E[k] - E[7 - k];
        }

        /* EEE and EEO */
        EEE[0] = EE[0] + EE[3];
        EEO[0] = EE[0] - EE[3];
        EEE[1] = EE[1] + EE[2];
        EEO[1] = EE[1] - EE[2];

        // Rows 0, 4, 8, 12 come from the even-even terms
        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);

        // Rows 2, 6, 10, 14 from the even-odd terms
        for (k = 2; k < 16; k += 4)
        {
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
                                       g_t16[k][3] * EO[3] + add) >> shift);
        }

        // Odd rows from the odd terms
        for (k = 1; k < 16; k += 2)
        {
            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
                                        g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
                                        add) >> shift);
        }

        src += 16;
        dst++;
    }
}
135
136
// Forward 32-point partial-butterfly DCT pass (HEVC reference style).
// src: 'line' rows of 32 samples; dst: transposed output, coefficient k of
// row j is stored at dst[k * line + j]; shift: rounding down-shift.
static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* E and O*/
        for (k = 0; k < 16; k++)
        {
            E[k] = src[k] + src[31 - k];
            O[k] = src[k] - src[31 - k];
        }

        /* EE and EO */
        for (k = 0; k < 8; k++)
        {
            EE[k] = E[k] + E[15 - k];
            EO[k] = E[k] - E[15 - k];
        }

        /* EEE and EEO */
        for (k = 0; k < 4; k++)
        {
            EEE[k] = EE[k] + EE[7 - k];
            EEO[k] = EE[k] - EE[7 - k];
        }

        /* EEEE and EEEO */
        EEEE[0] = EEE[0] + EEE[3];
        EEEO[0] = EEE[0] - EEE[3];
        EEEE[1] = EEE[1] + EEE[2];
        EEEO[1] = EEE[1] - EEE[2];

        // Rows 0, 8, 16, 24 from the deepest even terms
        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
        for (k = 4; k < 32; k += 8)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
                                       g_t32[k][3] * EEO[3] + add) >> shift);
        }

        for (k = 2; k < 32; k += 4)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
        }

        // Odd rows from the odd terms
        for (k = 1; k < 32; k += 2)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
                                       O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
                                       g_t32[k][15] * O[15] + add) >> shift);
        }

        src += 32;
        dst++;
    }
}
204
205
// Forward 8-point partial-butterfly DCT pass (HEVC reference style).
// src: 'line' rows of 8 samples; dst: transposed output, coefficient k of
// row j is stored at dst[k * line + j]; shift: rounding down-shift.
static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* E and O*/
        for (k = 0; k < 4; k++)
        {
            E[k] = src[k] + src[7 - k];
            O[k] = src[k] - src[7 - k];
        }

        /* EE and EO */
        EE[0] = E[0] + E[3];
        EO[0] = E[0] - E[3];
        EE[1] = E[1] + E[2];
        EO[1] = E[1] - E[2];

        // Even rows from EE/EO
        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);

        // Odd rows from O
        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);

        src += 8;
        dst++;
    }
}
241
242
// Inverse 4-point partial-butterfly DCT pass; reads column-major coefficients
// (stride 'line'), writes 'line' rows of 4 clipped samples.
static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));

        src++;
        dst += 4;
    }
}
266
267
// Inverse 8-point partial-butterfly DCT pass; reads column-major coefficients
// (stride 'line'), writes 'line' rows of 8 clipped samples.
static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 4; k++)
        {
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
        }

        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        E[0] = EE[0] + EO[0];
        E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];
        E[2] = EE[1] - EO[1];
        for (k = 0; k < 4; k++)
        {
            // First half E+O, second half mirrored E-O; clip to int16_t range
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
        }

        src++;
        dst += 8;
    }
}
302
303
// Inverse 16-point partial-butterfly DCT pass; reads column-major coefficients
// (stride 'line'), writes 'line' rows of 16 clipped samples.
static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 8; k++)
        {
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
        }

        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        for (k = 0; k < 2; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
        }

        for (k = 0; k < 4; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 4] = EE[3 - k] - EO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            // First half E+O, second half mirrored E-O; clip to int16_t range
            dst[k]   = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
        }

        src++;
        dst += 16;
    }
}
353
354
// Inverse 32-point partial-butterfly DCT pass; reads column-major coefficients
// (stride 'line'), writes 'line' rows of 32 clipped samples.
static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);  // rounding offset

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 16; k++)
        {
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
        }

        for (k = 0; k < 8; k++)
        {
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
        }

        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];

        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
        EEE[0] = EEEE[0] + EEEO[0];
        EEE[3] = EEEE[0] - EEEO[0];
        EEE[1] = EEEE[1] + EEEO[1];
        EEE[2] = EEEE[1] - EEEO[1];
        for (k = 0; k < 4; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 8] = EE[7 - k] - EO[7 - k];
        }

        for (k = 0; k < 16; k++)
        {
            // First half E+O, second half mirrored E-O; clip to int16_t range
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
        }

        src++;
        dst += 32;
    }
}
417
418
static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
419
13.4M
{
420
13.4M
    int j;
421
13.4M
    int E[2], O[2];
422
13.4M
    int add = 1 << (shift - 1);
423
424
67.4M
    for (j = 0; j < line; j++)
425
53.9M
    {
426
        /* E and O */
427
53.9M
        E[0] = src[0] + src[3];
428
53.9M
        O[0] = src[0] - src[3];
429
53.9M
        E[1] = src[1] + src[2];
430
53.9M
        O[1] = src[1] - src[2];
431
432
53.9M
        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
433
53.9M
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
434
53.9M
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
435
53.9M
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);
436
437
53.9M
        src += 4;
438
53.9M
        dst++;
439
53.9M
    }
440
13.4M
}
441
442
namespace X265_NS {
// 4x4 forward DST: copy the strided residual into a contiguous block, then
// run the fast DST over rows (pass 1) and columns (pass 2).
void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
    }

    fastForwardDst(block, coef, shift_1st);
    fastForwardDst(coef, dst, shift_2nd);
}

// 4x4 forward DCT: two partial-butterfly passes (rows then columns).
void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
    }

    partialButterfly4(block, coef, shift_1st, 4);
    partialButterfly4(coef, dst, shift_2nd, 4);
}

// 8x8 forward DCT: two partial-butterfly passes (rows then columns).
void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 2 + X265_DEPTH - 8;
    const int shift_2nd = 9;

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
    }

    partialButterfly8(block, coef, shift_1st, 8);
    partialButterfly8(coef, dst, shift_2nd, 8);
}

// 16x16 forward DCT: two partial-butterfly passes (rows then columns).
void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 3 + X265_DEPTH - 8;
    const int shift_2nd = 10;

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
    }

    partialButterfly16(block, coef, shift_1st, 16);
    partialButterfly16(coef, dst, shift_2nd, 16);
}

// 32x32 forward DCT: two partial-butterfly passes (rows then columns).
void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 4 + X265_DEPTH - 8;
    const int shift_2nd = 11;

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
    }

    partialButterfly32(block, coef, shift_1st, 32);
    partialButterfly32(coef, dst, shift_2nd, 32);
}

// 4x4 inverse DST: two fast inverse-DST passes, then copy out to the
// strided destination.
void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    inversedst(src, coef, shift_1st); // inverse DST pass 1: src input, coef output
    inversedst(coef, block, shift_2nd); // inverse DST pass 2: coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

// 4x4 inverse DCT: two inverse partial-butterfly passes, then copy out.
void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    partialButterflyInverse4(src, coef, shift_1st, 4); // inverse DCT pass 1: src input, coef output
    partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse DCT pass 2: coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

// 8x8 inverse DCT: two inverse partial-butterfly passes, then copy out.
void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    partialButterflyInverse8(src, coef, shift_1st, 8);
    partialButterflyInverse8(coef, block, shift_2nd, 8);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
    }
}

// 16x16 inverse DCT: two inverse partial-butterfly passes, then copy out.
void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    partialButterflyInverse16(src, coef, shift_1st, 16);
    partialButterflyInverse16(coef, block, shift_2nd, 16);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
    }
}

// 32x32 inverse DCT: two inverse partial-butterfly passes, then copy out.
void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    partialButterflyInverse32(src, coef, shift_1st, 32);
    partialButterflyInverse32(coef, block, shift_2nd, 32);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
    }
}
} // namespace X265_NS
613
614
// Dequantize 'num' coefficients with a single flat scale:
// coef[n] = clip16((quantCoef[n] * scale + round) >> shift).
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
#else
    // NOTE: maximum of scale is (72 * 256)
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
#endif
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");

    int add, coeffQ;

    add = 1 << (shift - 1);  // rounding offset

    for (int n = 0; n < num; n++)
    {
        coeffQ = (quantCoef[n] * scale + add) >> shift;
        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
    }
}
637
638
static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
639
0
{
640
0
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
641
642
0
    int add, coeffQ;
643
644
0
    shift += 4;
645
646
0
    if (shift > per)
647
0
    {
648
0
        add = 1 << (shift - per - 1);
649
650
0
        for (int n = 0; n < num; n++)
651
0
        {
652
0
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
653
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
654
0
        }
655
0
    }
656
0
    else
657
0
    {
658
0
        for (int n = 0; n < num; n++)
659
0
        {
660
0
            coeffQ   = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
661
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
662
0
        }
663
0
    }
664
0
}
665
666
// Quantize 'numCoeff' coefficients: level = (|coef| * quantCoeff + add) >> qBits,
// with the sign restored afterwards. deltaU records the rounding remainder
// (used later for sign-bit hiding / RDOQ). Returns the count of non-zero levels.
static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
    int qBits8 = qBits - 8;
    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign  = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        // Remainder left after quantization, scaled down by qBits-8
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
        if (level)
            ++numSig;
        level *= sign;
        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
    }

    return numSig;
}
689
690
// Quantize without recording rounding deltas; outputs absolute levels only
// (callers track signs separately). Returns the count of non-zero levels.
static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");

    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign  = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        if (level)
            ++numSig;
        level *= sign;

        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
    }

    return numSig;
}
716
template<int trSize>
717
int  count_nonzero_c(const int16_t* quantCoeff)
718
66.7k
{
719
66.7k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
66.7k
    int count = 0;
721
66.7k
    int numCoeff = trSize * trSize;
722
10.4M
    for (int i = 0; i < numCoeff; i++)
723
10.4M
    {
724
10.4M
        count += quantCoeff[i] != 0;
725
10.4M
    }
726
727
66.7k
    return count;
728
66.7k
}
int count_nonzero_c<4>(short const*)
Line
Count
Source
718
29.8k
{
719
29.8k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
29.8k
    int count = 0;
721
29.8k
    int numCoeff = trSize * trSize;
722
506k
    for (int i = 0; i < numCoeff; i++)
723
476k
    {
724
476k
        count += quantCoeff[i] != 0;
725
476k
    }
726
727
29.8k
    return count;
728
29.8k
}
int count_nonzero_c<8>(short const*)
Line
Count
Source
718
16.9k
{
719
16.9k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
16.9k
    int count = 0;
721
16.9k
    int numCoeff = trSize * trSize;
722
1.10M
    for (int i = 0; i < numCoeff; i++)
723
1.08M
    {
724
1.08M
        count += quantCoeff[i] != 0;
725
1.08M
    }
726
727
16.9k
    return count;
728
16.9k
}
int count_nonzero_c<16>(short const*)
Line
Count
Source
718
15.1k
{
719
15.1k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
15.1k
    int count = 0;
721
15.1k
    int numCoeff = trSize * trSize;
722
3.89M
    for (int i = 0; i < numCoeff; i++)
723
3.87M
    {
724
3.87M
        count += quantCoeff[i] != 0;
725
3.87M
    }
726
727
15.1k
    return count;
728
15.1k
}
int count_nonzero_c<32>(short const*)
Line
Count
Source
718
4.87k
{
719
4.87k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
4.87k
    int count = 0;
721
4.87k
    int numCoeff = trSize * trSize;
722
5.00M
    for (int i = 0; i < numCoeff; i++)
723
4.99M
    {
724
4.99M
        count += quantCoeff[i] != 0;
725
4.99M
    }
726
727
4.87k
    return count;
728
4.87k
}
729
730
template<int trSize>
731
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
732
3.51M
{
733
3.51M
    uint32_t numSig = 0;
734
21.1M
    for (int k = 0; k < trSize; k++)
735
17.6M
    {
736
134M
        for (int j = 0; j < trSize; j++)
737
116M
        {
738
116M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
116M
            numSig += (residual[k * resiStride + j] != 0);
740
116M
        }
741
17.6M
    }
742
743
3.51M
    return numSig;
744
3.51M
}
unsigned int copy_count<4>(short*, short const*, long)
Line
Count
Source
732
2.90M
{
733
2.90M
    uint32_t numSig = 0;
734
14.5M
    for (int k = 0; k < trSize; k++)
735
11.5M
    {
736
57.9M
        for (int j = 0; j < trSize; j++)
737
46.3M
        {
738
46.3M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
46.3M
            numSig += (residual[k * resiStride + j] != 0);
740
46.3M
        }
741
11.5M
    }
742
743
2.90M
    return numSig;
744
2.90M
}
unsigned int copy_count<8>(short*, short const*, long)
Line
Count
Source
732
492k
{
733
492k
    uint32_t numSig = 0;
734
4.43M
    for (int k = 0; k < trSize; k++)
735
3.94M
    {
736
35.4M
        for (int j = 0; j < trSize; j++)
737
31.5M
        {
738
31.5M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
31.5M
            numSig += (residual[k * resiStride + j] != 0);
740
31.5M
        }
741
3.94M
    }
742
743
492k
    return numSig;
744
492k
}
unsigned int copy_count<16>(short*, short const*, long)
Line
Count
Source
732
109k
{
733
109k
    uint32_t numSig = 0;
734
1.86M
    for (int k = 0; k < trSize; k++)
735
1.75M
    {
736
29.8M
        for (int j = 0; j < trSize; j++)
737
28.0M
        {
738
28.0M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
28.0M
            numSig += (residual[k * resiStride + j] != 0);
740
28.0M
        }
741
1.75M
    }
742
743
109k
    return numSig;
744
109k
}
unsigned int copy_count<32>(short*, short const*, long)
Line
Count
Source
732
10.3k
{
733
10.3k
    uint32_t numSig = 0;
734
341k
    for (int k = 0; k < trSize; k++)
735
331k
    {
736
10.9M
        for (int j = 0; j < trSize; j++)
737
10.6M
        {
738
10.6M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
10.6M
            numSig += (residual[k * resiStride + j] != 0);
740
10.6M
        }
741
331k
    }
742
743
10.3k
    return numSig;
744
10.3k
}
745
746
static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
747
0
{
748
0
    for (int i = 0; i < numCoeff; i++)
749
0
    {
750
0
        int level = dctCoef[i];
751
0
        int sign = level >> 31;
752
0
        level = (level + sign) ^ sign;
753
0
        resSum[i] += level;
754
0
        level -= offset[i];
755
0
        dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
756
0
    }
757
0
}
758
759
static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
760
115k
{
761
115k
    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
762
115k
    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
763
115k
    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
764
765
115k
    int scanPosLast = 0;
766
115k
    do
767
2.29M
    {
768
2.29M
        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
769
770
2.29M
        const uint32_t posLast = scan[scanPosLast++];
771
772
2.29M
        const int curCoeff = coeff[posLast];
773
2.29M
        const uint32_t isNZCoeff = (curCoeff != 0);
774
        // get L1 sig map
775
        // NOTE: the new algorithm is complicated, so I keep reference code here
776
        //uint32_t posy   = posLast >> log2TrSize;
777
        //uint32_t posx   = posLast - (posy << log2TrSize);
778
        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
779
        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
780
        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
781
2.29M
        numSig -= isNZCoeff;
782
783
        // TODO: optimize by instruction BTS
784
2.29M
        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
785
2.29M
        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
786
2.29M
        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
787
2.29M
    }
788
2.29M
    while (numSig > 0);
789
115k
    return scanPosLast - 1;
790
115k
}
791
792
// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
793
static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
794
5.31k
{
795
5.31k
    int n;
796
797
18.9k
    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
798
18.9k
    {
799
18.9k
        const uint32_t idx = scanTbl[n];
800
18.9k
        const uint32_t idxY = idx / MLS_CG_SIZE;
801
18.9k
        const uint32_t idxX = idx % MLS_CG_SIZE;
802
18.9k
        if (dstCoeff[idxY * trSize + idxX])
803
5.31k
            break;
804
18.9k
    }
805
806
5.31k
    X265_CHECK(n >= -1, "non-zero coeff scan failuare!\n");
807
808
5.31k
    uint32_t lastNZPosInCG = (uint32_t)n;
809
810
5.31k
    for (n = 0; n < SCAN_SET_SIZE; n++)
811
5.31k
    {
812
5.31k
        const uint32_t idx = scanTbl[n];
813
5.31k
        const uint32_t idxY = idx / MLS_CG_SIZE;
814
5.31k
        const uint32_t idxX = idx % MLS_CG_SIZE;
815
5.31k
        if (dstCoeff[idxY * trSize + idxX])
816
5.31k
            break;
817
5.31k
    }
818
819
5.31k
    uint32_t firstNZPosInCG = (uint32_t)n;
820
821
5.31k
    uint32_t absSumSign = 0;
822
76.5k
    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
823
71.2k
    {
824
71.2k
        const uint32_t idx = scanTbl[n];
825
71.2k
        const uint32_t idxY = idx / MLS_CG_SIZE;
826
71.2k
        const uint32_t idxX = idx % MLS_CG_SIZE;
827
71.2k
        absSumSign += dstCoeff[idxY * trSize + idxX];
828
71.2k
    }
829
830
    // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
831
5.31k
    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
832
5.31k
}
833
834
835
static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
836
133k
{
837
133k
    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
838
133k
    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
839
133k
    uint32_t sum = 0;
840
841
    // correct offset to match assembly
842
133k
    absCoeff -= numNonZero;
843
844
667k
    for (int i = 0; i < MLS_CG_SIZE; i++)
845
534k
    {
846
534k
        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
847
534k
        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
848
534k
        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
849
534k
        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
850
534k
    }
851
852
133k
    do
853
2.09M
    {
854
2.09M
        uint32_t blkPos, sig, ctxSig;
855
2.09M
        blkPos = scan[scanPosSigOff];
856
2.09M
        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
857
2.09M
        sig     = scanFlagMask & 1;
858
2.09M
        scanFlagMask >>= 1;
859
2.09M
        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
860
2.09M
        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
861
2.09M
        {
862
2.09M
            const uint32_t cnt = tabSigCtx[blkPos] + offset;
863
2.09M
            ctxSig = cnt & posZeroMask;
864
865
            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
866
            //encodeBin(sig, baseCtx[ctxSig]);
867
2.09M
            const uint32_t mstate = baseCtx[ctxSig];
868
2.09M
            const uint32_t mps = mstate & 1;
869
2.09M
            const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
870
2.09M
            uint32_t nextState = (stateBits >> 24) + mps;
871
2.09M
            if ((mstate ^ sig) == 1)
872
16.7k
                nextState = sig;
873
2.09M
            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
874
2.09M
            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
875
2.09M
            baseCtx[ctxSig] = (uint8_t)nextState;
876
2.09M
            sum += stateBits;
877
2.09M
        }
878
2.09M
        assert(numNonZero <= 15);
879
2.09M
        assert(blkPos <= 15);
880
2.09M
        absCoeff[numNonZero] = tmpCoeff[blkPos];
881
2.09M
        numNonZero += sig;
882
2.09M
        scanPosSigOff--;
883
2.09M
    }
884
2.09M
    while(scanPosSigOff >= 0);
885
886
133k
    return (sum & 0xFFFFFF);
887
133k
}
888
889
static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
890
171k
{
891
171k
    uint32_t goRiceParam = 0;
892
893
171k
    uint32_t sum = 0;
894
171k
    int baseLevel = 3;
895
171k
    do
896
2.14M
    {
897
2.14M
        if (idx >= C1FLAG_NUMBER)
898
1.05M
            baseLevel = 1;
899
900
        // TODO: the IDX is not really idx, so this check inactive
901
        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
902
2.14M
        int codeNumber = absCoeff[idx] - baseLevel;
903
904
2.14M
        if (codeNumber >= 0)
905
2.14M
        {
906
            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
907
2.14M
            uint32_t length = 0;
908
909
2.14M
            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
910
2.14M
            if (codeNumber >= 0)
911
2.10M
            {
912
2.10M
                {
913
2.10M
                    unsigned long cidx;
914
2.10M
                    BSR(cidx, codeNumber + 1);
915
2.10M
                    length = cidx;
916
2.10M
                }
917
2.10M
                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
918
919
2.10M
                codeNumber = (length + length);
920
2.10M
            }
921
2.14M
            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
922
923
2.14M
            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
924
2.11M
                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
925
2.14M
            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
926
2.14M
        }
927
2.14M
        baseLevel = 2;
928
2.14M
        idx++;
929
2.14M
    }
930
2.14M
    while(idx < numNonZero);
931
932
171k
    return sum;
933
171k
}
934
935
936
static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
937
176k
{
938
176k
    uint32_t sum = 0;
939
176k
    uint32_t c1 = 1;
940
176k
    uint32_t firstC2Idx = 8;
941
176k
    uint32_t firstC2Flag = 2;
942
176k
    uint32_t c1Next = 0xFFFFFFFE;
943
944
176k
    int idx = 0;
945
176k
    do
946
1.10M
    {
947
1.10M
        uint32_t symbol1 = absCoeff[idx] > 1;
948
1.10M
        uint32_t symbol2 = absCoeff[idx] > 2;
949
        //encodeBin(symbol1, baseCtxMod[c1]);
950
1.10M
        {
951
1.10M
            const uint32_t mstate = baseCtxMod[c1];
952
1.10M
            baseCtxMod[c1] = sbacNext(mstate, symbol1);
953
1.10M
            sum += sbacGetEntropyBits(mstate, symbol1);
954
1.10M
        }
955
956
1.10M
        if (symbol1)
957
1.09M
            c1Next = 0;
958
959
1.10M
        if (symbol1 + firstC2Flag == 3)
960
171k
            firstC2Flag = symbol2;
961
962
1.10M
        if (symbol1 + firstC2Idx == 9)
963
171k
            firstC2Idx  = idx;
964
965
1.10M
        c1 = (c1Next & 3);
966
1.10M
        c1Next >>= 2;
967
1.10M
        X265_CHECK(c1 <= 3, "c1 check failure\n");
968
1.10M
        idx++;
969
1.10M
    }
970
1.10M
    while(idx < numC1Flag);
971
972
176k
    if (!c1)
973
171k
    {
974
171k
        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
975
976
171k
        baseCtxMod += ctxOffset;
977
978
        //encodeBin(firstC2Flag, baseCtxMod[0]);
979
171k
        {
980
171k
            const uint32_t mstate = baseCtxMod[0];
981
171k
            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
982
171k
            sum += sbacGetEntropyBits(mstate, firstC2Flag);
983
171k
        }
984
171k
    }
985
176k
    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
986
176k
}
987
template<int log2TrSize>
988
static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
989
99.0k
{
990
99.0k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
99.0k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
99.0k
    const uint32_t trSize = 1 << log2TrSize;
993
994
495k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
396k
    {
996
1.98M
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
1.58M
        {
998
1.58M
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
1.58M
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
1.58M
             *totalUncodedCost += costUncoded[blkPos + x];
1001
1.58M
             *totalRdCost += costUncoded[blkPos + x];
1002
1.58M
        }
1003
396k
        blkPos += trSize;
1004
396k
    }
1005
99.0k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void nonPsyRdoQuant_c<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
989
39.4k
{
990
39.4k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
39.4k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
39.4k
    const uint32_t trSize = 1 << log2TrSize;
993
994
197k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
157k
    {
996
788k
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
630k
        {
998
630k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
630k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
630k
             *totalUncodedCost += costUncoded[blkPos + x];
1001
630k
             *totalRdCost += costUncoded[blkPos + x];
1002
630k
        }
1003
157k
        blkPos += trSize;
1004
157k
    }
1005
39.4k
}
dct.cpp:void nonPsyRdoQuant_c<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
989
59.6k
{
990
59.6k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
59.6k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
59.6k
    const uint32_t trSize = 1 << log2TrSize;
993
994
298k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
238k
    {
996
1.19M
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
953k
        {
998
953k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
953k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
953k
             *totalUncodedCost += costUncoded[blkPos + x];
1001
953k
             *totalRdCost += costUncoded[blkPos + x];
1002
953k
        }
1003
238k
        blkPos += trSize;
1004
238k
    }
1005
59.6k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<5>(short*, long*, long*, long*, unsigned int)
1006
template<int log2TrSize>
1007
static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1008
0
{
1009
0
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1010
0
    const int scaleBits = SCALE_BITS - 2 * transformShift;
1011
0
    const uint32_t trSize = 1 << log2TrSize;
1012
0
    int max = X265_MAX(0, (2 * transformShift + 1));
1013
1014
0
    for (int y = 0; y < MLS_CG_SIZE; y++)
1015
0
    {
1016
0
        for (int x = 0; x < MLS_CG_SIZE; x++)
1017
0
        {
1018
0
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1019
0
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1020
1021
0
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1022
1023
            /* when no residual coefficient is coded, predicted coef == recon coef */
1024
0
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1025
1026
0
            *totalUncodedCost += costUncoded[blkPos + x];
1027
0
            *totalRdCost += costUncoded[blkPos + x];
1028
0
        }
1029
0
        blkPos += trSize;
1030
0
    }
1031
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1032
template<int log2TrSize>
1033
static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
1034
277k
{
1035
277k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
277k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
277k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
1.38M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
1.11M
  {
1041
5.55M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
4.44M
    {
1043
4.44M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
4.44M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
4.44M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
4.44M
      *totalRdCost += costUncoded[blkPos + x];
1047
4.44M
    }
1048
1.11M
    blkPos += trSize;
1049
1.11M
  }
1050
277k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_1<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
15.4k
{
1035
15.4k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
15.4k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
15.4k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
77.1k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
61.7k
  {
1041
308k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
246k
    {
1043
246k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
246k
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
246k
      *totalUncodedCost += costUncoded[blkPos + x];
1046
246k
      *totalRdCost += costUncoded[blkPos + x];
1047
246k
    }
1048
61.7k
    blkPos += trSize;
1049
61.7k
  }
1050
15.4k
}
dct.cpp:void psyRdoQuant_c_1<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
98.8k
{
1035
98.8k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
98.8k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
98.8k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
494k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
395k
  {
1041
1.97M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
1.58M
    {
1043
1.58M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
1.58M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
1.58M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
1.58M
      *totalRdCost += costUncoded[blkPos + x];
1047
1.58M
    }
1048
395k
    blkPos += trSize;
1049
395k
  }
1050
98.8k
}
dct.cpp:void psyRdoQuant_c_1<5>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
163k
{
1035
163k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
163k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
163k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
818k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
654k
  {
1041
3.27M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
2.61M
    {
1043
2.61M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
2.61M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
2.61M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
2.61M
      *totalRdCost += costUncoded[blkPos + x];
1047
2.61M
    }
1048
654k
    blkPos += trSize;
1049
654k
  }
1050
163k
}
1051
template<int log2TrSize>
1052
static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1053
277k
{
1054
277k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
277k
  const uint32_t trSize = 1 << log2TrSize;
1057
277k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
1.38M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
1.11M
  {
1061
5.55M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
4.44M
    {
1063
4.44M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
4.44M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
4.44M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
4.44M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
4.44M
      *totalRdCost += costUncoded[blkPos + x];
1068
4.44M
    }
1069
1.11M
    blkPos += trSize;
1070
1.11M
  }
1071
277k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<2>(short*, short*, long*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_2<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
15.4k
{
1054
15.4k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
15.4k
  const uint32_t trSize = 1 << log2TrSize;
1057
15.4k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
77.1k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
61.7k
  {
1061
308k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
246k
    {
1063
246k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
246k
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
246k
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
246k
      *totalUncodedCost += costUncoded[blkPos + x];
1067
246k
      *totalRdCost += costUncoded[blkPos + x];
1068
246k
    }
1069
61.7k
    blkPos += trSize;
1070
61.7k
  }
1071
15.4k
}
dct.cpp:void psyRdoQuant_c_2<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
98.8k
{
1054
98.8k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
98.8k
  const uint32_t trSize = 1 << log2TrSize;
1057
98.8k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
494k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
395k
  {
1061
1.97M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
1.58M
    {
1063
1.58M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
1.58M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
1.58M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
1.58M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
1.58M
      *totalRdCost += costUncoded[blkPos + x];
1068
1.58M
    }
1069
395k
    blkPos += trSize;
1070
395k
  }
1071
98.8k
}
dct.cpp:void psyRdoQuant_c_2<5>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
163k
{
1054
163k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
163k
  const uint32_t trSize = 1 << log2TrSize;
1057
163k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
818k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
654k
  {
1061
3.27M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
2.61M
    {
1063
2.61M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
2.61M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
2.61M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
2.61M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
2.61M
      *totalRdCost += costUncoded[blkPos + x];
1068
2.61M
    }
1069
654k
    blkPos += trSize;
1070
654k
  }
1071
163k
}
1072
1073
namespace X265_NS {
1074
// x265 private namespace
1075
void setupDCTPrimitives_c(EncoderPrimitives& p)
1076
1
{
1077
1
    p.dequant_scaling = dequant_scaling_c;
1078
1
    p.dequant_normal = dequant_normal_c;
1079
1
    p.quant = quant_c;
1080
1
    p.nquant = nquant_c;
1081
1
    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
1082
1
    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
1083
1
    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
1084
1
    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
1085
1
    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
1086
1
    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
1087
1
    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
1088
1
    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
1089
1
    p.dst4x4 = dst4_c;
1090
1
    p.cu[BLOCK_4x4].dct   = dct4_c;
1091
1
    p.cu[BLOCK_8x8].dct   = dct8_c;
1092
1
    p.cu[BLOCK_16x16].dct = dct16_c;
1093
1
    p.cu[BLOCK_32x32].dct = dct32_c;
1094
1
    p.idst4x4 = idst4_c;
1095
1
    p.cu[BLOCK_4x4].idct   = idct4_c;
1096
1
    p.cu[BLOCK_8x8].idct   = idct8_c;
1097
1
    p.cu[BLOCK_16x16].idct = idct16_c;
1098
1
    p.cu[BLOCK_32x32].idct = idct32_c;
1099
1
    p.denoiseDct = denoiseDct_c;
1100
1
    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
1101
1
    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
1102
1
    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
1103
1
    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
1104
1105
1
    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
1106
1
    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
1107
1
    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
1108
1
    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
1109
1
  p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
1110
1
  p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
1111
1
  p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
1112
1
  p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
1113
1
  p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
1114
1
  p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
1115
1
  p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
1116
1
  p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
1117
1
    p.scanPosLast = scanPosLast_c;
1118
1
    p.findPosFirstLast = findPosFirstLast_c;
1119
1
    p.costCoeffNxN = costCoeffNxN_c;
1120
1
    p.costCoeffRemain = costCoeffRemain_c;
1121
1
    p.costC1C2Flag = costC1C2Flag_c;
1122
1
}
1123
}