Coverage Report

Created: 2026-06-10 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/common/dct.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
5
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
6
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
7
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
8
 *          Min Chen <min.chen@multicorewareinc.com>
9
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10
 *          Nabajit Deka <nabajit@multicorewareinc.com>
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
25
 *
26
 * This program is also available under a commercial proprietary license.
27
 * For more information, contact us at license @ x265.com.
28
 *****************************************************************************/
29
30
#include "common.h"
31
#include "primitives.h"
32
#include "contexts.h"   // costCoeffNxN_c
33
#include "threading.h"  // BSR
34
35
using namespace X265_NS;
36
37
#if _MSC_VER
38
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
39
#endif
40
41
// Fast DST algorithm. The full matrix multiplication for DST and this fast
// form give identical results.
//   block : 16 input samples, read as four consecutive groups of four
//   coeff : 16 outputs; group i writes coeff[i], coeff[4 + i], coeff[8 + i], coeff[12 + i]
//   shift : right shift applied after adding the rounding term 1 << (shift - 1)
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)
{
    const int round = 1 << (shift - 1);

    for (int row = 0; row < 4; row++)
    {
        const int16_t* in = block + 4 * row;
        int16_t* out = coeff + row;

        // Partial terms shared by several outputs
        const int sum03 = in[0] + in[3];
        const int sum13 = in[1] + in[3];
        const int diff01 = in[0] - in[1];
        const int mid = 74 * in[2];

        out[0]  = (int16_t)((29 * sum03 + 55 * sum13 + mid + round) >> shift);
        out[4]  = (int16_t)((74 * (in[0] + in[1] - in[3]) + round) >> shift);
        out[8]  = (int16_t)((29 * diff01 + 55 * sum03 - mid + round) >> shift);
        out[12] = (int16_t)((55 * diff01 - 29 * sum13 + mid + round) >> shift);
    }
}
62
63
static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
64
11.4k
{
65
11.4k
    int i, c[4];
66
11.4k
    int rnd_factor = 1 << (shift - 1);
67
68
57.3k
    for (i = 0; i < 4; i++)
69
45.8k
    {
70
        // Intermediate Variables
71
45.8k
        c[0] = tmp[i] + tmp[8 + i];
72
45.8k
        c[1] = tmp[8 + i] + tmp[12 + i];
73
45.8k
        c[2] = tmp[i] - tmp[12 + i];
74
45.8k
        c[3] = 74 * tmp[4 + i];
75
76
45.8k
        block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
77
45.8k
        block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
78
45.8k
        block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
79
45.8k
        block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
80
45.8k
    }
81
11.4k
}
82
83
static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
84
842k
{
85
842k
    int j, k;
86
842k
    int E[8], O[8];
87
842k
    int EE[4], EO[4];
88
842k
    int EEE[2], EEO[2];
89
842k
    int add = 1 << (shift - 1);
90
91
14.2M
    for (j = 0; j < line; j++)
92
13.3M
    {
93
        /* E and O */
94
120M
        for (k = 0; k < 8; k++)
95
107M
        {
96
107M
            E[k] = src[k] + src[15 - k];
97
107M
            O[k] = src[k] - src[15 - k];
98
107M
        }
99
100
        /* EE and EO */
101
67.0M
        for (k = 0; k < 4; k++)
102
53.6M
        {
103
53.6M
            EE[k] = E[k] + E[7 - k];
104
53.6M
            EO[k] = E[k] - E[7 - k];
105
53.6M
        }
106
107
        /* EEE and EEO */
108
13.3M
        EEE[0] = EE[0] + EE[3];
109
13.3M
        EEO[0] = EE[0] - EE[3];
110
13.3M
        EEE[1] = EE[1] + EE[2];
111
13.3M
        EEO[1] = EE[1] - EE[2];
112
113
13.3M
        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
114
13.3M
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
115
13.3M
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
116
13.3M
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
117
118
66.9M
        for (k = 2; k < 16; k += 4)
119
53.6M
        {
120
53.6M
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
121
53.6M
                                       g_t16[k][3] * EO[3] + add) >> shift);
122
53.6M
        }
123
124
120M
        for (k = 1; k < 16; k += 2)
125
107M
        {
126
107M
            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
127
107M
                                        g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
128
107M
                                        add) >> shift);
129
107M
        }
130
131
13.3M
        src += 16;
132
13.3M
        dst++;
133
13.3M
    }
134
842k
}
135
136
static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
137
95.3k
{
138
95.3k
    int j, k;
139
95.3k
    int E[16], O[16];
140
95.3k
    int EE[8], EO[8];
141
95.3k
    int EEE[4], EEO[4];
142
95.3k
    int EEEE[2], EEEO[2];
143
95.3k
    int add = 1 << (shift - 1);
144
145
3.12M
    for (j = 0; j < line; j++)
146
3.03M
    {
147
        /* E and O*/
148
51.4M
        for (k = 0; k < 16; k++)
149
48.4M
        {
150
48.4M
            E[k] = src[k] + src[31 - k];
151
48.4M
            O[k] = src[k] - src[31 - k];
152
48.4M
        }
153
154
        /* EE and EO */
155
27.2M
        for (k = 0; k < 8; k++)
156
24.2M
        {
157
24.2M
            EE[k] = E[k] + E[15 - k];
158
24.2M
            EO[k] = E[k] - E[15 - k];
159
24.2M
        }
160
161
        /* EEE and EEO */
162
15.1M
        for (k = 0; k < 4; k++)
163
12.0M
        {
164
12.0M
            EEE[k] = EE[k] + EE[7 - k];
165
12.0M
            EEO[k] = EE[k] - EE[7 - k];
166
12.0M
        }
167
168
        /* EEEE and EEEO */
169
3.03M
        EEEE[0] = EEE[0] + EEE[3];
170
3.03M
        EEEO[0] = EEE[0] - EEE[3];
171
3.03M
        EEEE[1] = EEE[1] + EEE[2];
172
3.03M
        EEEO[1] = EEE[1] - EEE[2];
173
174
3.03M
        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
175
3.03M
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
176
3.03M
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
177
3.03M
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
178
15.1M
        for (k = 4; k < 32; k += 8)
179
12.1M
        {
180
12.1M
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
181
12.1M
                                       g_t32[k][3] * EEO[3] + add) >> shift);
182
12.1M
        }
183
184
27.3M
        for (k = 2; k < 32; k += 4)
185
24.3M
        {
186
24.3M
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
187
24.3M
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
188
24.3M
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
189
24.3M
        }
190
191
51.4M
        for (k = 1; k < 32; k += 2)
192
48.3M
        {
193
48.3M
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
194
48.3M
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
195
48.3M
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
196
48.3M
                                       O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
197
48.3M
                                       g_t32[k][15] * O[15] + add) >> shift);
198
48.3M
        }
199
200
3.03M
        src += 32;
201
3.03M
        dst++;
202
3.03M
    }
203
95.3k
}
204
205
static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
206
3.73M
{
207
3.73M
    int j, k;
208
3.73M
    int E[4], O[4];
209
3.73M
    int EE[2], EO[2];
210
3.73M
    int add = 1 << (shift - 1);
211
212
33.5M
    for (j = 0; j < line; j++)
213
29.8M
    {
214
        /* E and O*/
215
149M
        for (k = 0; k < 4; k++)
216
119M
        {
217
119M
            E[k] = src[k] + src[7 - k];
218
119M
            O[k] = src[k] - src[7 - k];
219
119M
        }
220
221
        /* EE and EO */
222
29.8M
        EE[0] = E[0] + E[3];
223
29.8M
        EO[0] = E[0] - E[3];
224
29.8M
        EE[1] = E[1] + E[2];
225
29.8M
        EO[1] = E[1] - E[2];
226
227
29.8M
        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
228
29.8M
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
229
29.8M
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
230
29.8M
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
231
232
29.8M
        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
233
29.8M
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
234
29.8M
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
235
29.8M
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
236
237
29.8M
        src += 8;
238
29.8M
        dst++;
239
29.8M
    }
240
3.73M
}
241
242
static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
243
0
{
244
0
    int j;
245
0
    int E[2], O[2];
246
0
    int add = 1 << (shift - 1);
247
248
0
    for (j = 0; j < line; j++)
249
0
    {
250
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
251
0
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
252
0
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
253
0
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
254
0
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
255
256
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
257
0
        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
258
0
        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
259
0
        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
260
0
        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
261
262
0
        src++;
263
0
        dst += 4;
264
0
    }
265
0
}
266
267
static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
268
0
{
269
0
    int j, k;
270
0
    int E[4], O[4];
271
0
    int EE[2], EO[2];
272
0
    int add = 1 << (shift - 1);
273
274
0
    for (j = 0; j < line; j++)
275
0
    {
276
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
277
0
        for (k = 0; k < 4; k++)
278
0
        {
279
0
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
280
0
        }
281
282
0
        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
283
0
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
284
0
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
285
0
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];
286
287
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
288
0
        E[0] = EE[0] + EO[0];
289
0
        E[3] = EE[0] - EO[0];
290
0
        E[1] = EE[1] + EO[1];
291
0
        E[2] = EE[1] - EO[1];
292
0
        for (k = 0; k < 4; k++)
293
0
        {
294
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
295
0
            dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
296
0
        }
297
298
0
        src++;
299
0
        dst += 8;
300
0
    }
301
0
}
302
303
static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
304
0
{
305
0
    int j, k;
306
0
    int E[8], O[8];
307
0
    int EE[4], EO[4];
308
0
    int EEE[2], EEO[2];
309
0
    int add = 1 << (shift - 1);
310
311
0
    for (j = 0; j < line; j++)
312
0
    {
313
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
314
0
        for (k = 0; k < 8; k++)
315
0
        {
316
0
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
317
0
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
318
0
        }
319
320
0
        for (k = 0; k < 4; k++)
321
0
        {
322
0
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
323
0
        }
324
325
0
        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
326
0
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
327
0
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
328
0
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];
329
330
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
331
0
        for (k = 0; k < 2; k++)
332
0
        {
333
0
            EE[k] = EEE[k] + EEO[k];
334
0
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
335
0
        }
336
337
0
        for (k = 0; k < 4; k++)
338
0
        {
339
0
            E[k] = EE[k] + EO[k];
340
0
            E[k + 4] = EE[3 - k] - EO[3 - k];
341
0
        }
342
343
0
        for (k = 0; k < 8; k++)
344
0
        {
345
0
            dst[k]   = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
346
0
            dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
347
0
        }
348
349
0
        src++;
350
0
        dst += 16;
351
0
    }
352
0
}
353
354
static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
355
0
{
356
0
    int j, k;
357
0
    int E[16], O[16];
358
0
    int EE[8], EO[8];
359
0
    int EEE[4], EEO[4];
360
0
    int EEEE[2], EEEO[2];
361
0
    int add = 1 << (shift - 1);
362
363
0
    for (j = 0; j < line; j++)
364
0
    {
365
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
366
0
        for (k = 0; k < 16; k++)
367
0
        {
368
0
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
369
0
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
370
0
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
371
0
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
372
0
        }
373
374
0
        for (k = 0; k < 8; k++)
375
0
        {
376
0
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
377
0
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
378
0
        }
379
380
0
        for (k = 0; k < 4; k++)
381
0
        {
382
0
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
383
0
        }
384
385
0
        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
386
0
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
387
0
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
388
0
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
389
390
        /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
391
0
        EEE[0] = EEEE[0] + EEEO[0];
392
0
        EEE[3] = EEEE[0] - EEEO[0];
393
0
        EEE[1] = EEEE[1] + EEEO[1];
394
0
        EEE[2] = EEEE[1] - EEEO[1];
395
0
        for (k = 0; k < 4; k++)
396
0
        {
397
0
            EE[k] = EEE[k] + EEO[k];
398
0
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
399
0
        }
400
401
0
        for (k = 0; k < 8; k++)
402
0
        {
403
0
            E[k] = EE[k] + EO[k];
404
0
            E[k + 8] = EE[7 - k] - EO[7 - k];
405
0
        }
406
407
0
        for (k = 0; k < 16; k++)
408
0
        {
409
0
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
410
0
            dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
411
0
        }
412
413
0
        src++;
414
0
        dst += 32;
415
0
    }
416
0
}
417
418
static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
419
14.2M
{
420
14.2M
    int j;
421
14.2M
    int E[2], O[2];
422
14.2M
    int add = 1 << (shift - 1);
423
424
71.1M
    for (j = 0; j < line; j++)
425
56.9M
    {
426
        /* E and O */
427
56.9M
        E[0] = src[0] + src[3];
428
56.9M
        O[0] = src[0] - src[3];
429
56.9M
        E[1] = src[1] + src[2];
430
56.9M
        O[1] = src[1] - src[2];
431
432
56.9M
        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
433
56.9M
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
434
56.9M
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
435
56.9M
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);
436
437
56.9M
        src += 4;
438
56.9M
        dst++;
439
56.9M
    }
440
14.2M
}
441
442
namespace X265_NS {
443
void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
444
2.68M
{
445
2.68M
    const int shift_1st = 1 + X265_DEPTH - 8;
446
2.68M
    const int shift_2nd = 8;
447
448
2.68M
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
449
2.68M
    ALIGN_VAR_32(int16_t, block[4 * 4]);
450
451
13.4M
    for (int i = 0; i < 4; i++)
452
10.7M
    {
453
10.7M
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
454
10.7M
    }
455
456
2.68M
    fastForwardDst(block, coef, shift_1st);
457
2.68M
    fastForwardDst(coef, dst, shift_2nd);
458
2.68M
}
459
460
void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
461
7.13M
{
462
7.13M
    const int shift_1st = 1 + X265_DEPTH - 8;
463
7.13M
    const int shift_2nd = 8;
464
465
7.13M
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
466
7.13M
    ALIGN_VAR_32(int16_t, block[4 * 4]);
467
468
35.6M
    for (int i = 0; i < 4; i++)
469
28.5M
    {
470
28.5M
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
471
28.5M
    }
472
473
7.13M
    partialButterfly4(block, coef, shift_1st, 4);
474
7.13M
    partialButterfly4(coef, dst, shift_2nd, 4);
475
7.13M
}
476
477
void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
478
1.86M
{
479
1.86M
    const int shift_1st = 2 + X265_DEPTH - 8;
480
1.86M
    const int shift_2nd = 9;
481
482
1.86M
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
483
1.86M
    ALIGN_VAR_32(int16_t, block[8 * 8]);
484
485
16.8M
    for (int i = 0; i < 8; i++)
486
14.9M
    {
487
14.9M
        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
488
14.9M
    }
489
490
1.86M
    partialButterfly8(block, coef, shift_1st, 8);
491
1.86M
    partialButterfly8(coef, dst, shift_2nd, 8);
492
1.86M
}
493
494
void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
495
421k
{
496
421k
    const int shift_1st = 3 + X265_DEPTH - 8;
497
421k
    const int shift_2nd = 10;
498
499
421k
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
500
421k
    ALIGN_VAR_32(int16_t, block[16 * 16]);
501
502
7.16M
    for (int i = 0; i < 16; i++)
503
6.74M
    {
504
6.74M
        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
505
6.74M
    }
506
507
421k
    partialButterfly16(block, coef, shift_1st, 16);
508
421k
    partialButterfly16(coef, dst, shift_2nd, 16);
509
421k
}
510
511
void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
512
47.6k
{
513
47.6k
    const int shift_1st = 4 + X265_DEPTH - 8;
514
47.6k
    const int shift_2nd = 11;
515
516
47.6k
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
517
47.6k
    ALIGN_VAR_32(int16_t, block[32 * 32]);
518
519
1.57M
    for (int i = 0; i < 32; i++)
520
1.52M
    {
521
1.52M
        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
522
1.52M
    }
523
524
47.6k
    partialButterfly32(block, coef, shift_1st, 32);
525
47.6k
    partialButterfly32(coef, dst, shift_2nd, 32);
526
47.6k
}
527
528
void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
529
5.73k
{
530
5.73k
    const int shift_1st = 7;
531
5.73k
    const int shift_2nd = 12 - (X265_DEPTH - 8);
532
533
5.73k
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
534
5.73k
    ALIGN_VAR_32(int16_t, block[4 * 4]);
535
536
5.73k
    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
537
5.73k
    inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
538
539
28.6k
    for (int i = 0; i < 4; i++)
540
22.9k
    {
541
22.9k
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
542
22.9k
    }
543
5.73k
}
544
545
void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
546
0
{
547
0
    const int shift_1st = 7;
548
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
549
550
0
    ALIGN_VAR_32(int16_t, coef[4 * 4]);
551
0
    ALIGN_VAR_32(int16_t, block[4 * 4]);
552
553
0
    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
554
0
    partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
555
556
0
    for (int i = 0; i < 4; i++)
557
0
    {
558
0
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
559
0
    }
560
0
}
561
562
void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
563
0
{
564
0
    const int shift_1st = 7;
565
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
566
567
0
    ALIGN_VAR_32(int16_t, coef[8 * 8]);
568
0
    ALIGN_VAR_32(int16_t, block[8 * 8]);
569
570
0
    partialButterflyInverse8(src, coef, shift_1st, 8);
571
0
    partialButterflyInverse8(coef, block, shift_2nd, 8);
572
573
0
    for (int i = 0; i < 8; i++)
574
0
    {
575
0
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
576
0
    }
577
0
}
578
579
void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
580
0
{
581
0
    const int shift_1st = 7;
582
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
583
584
0
    ALIGN_VAR_32(int16_t, coef[16 * 16]);
585
0
    ALIGN_VAR_32(int16_t, block[16 * 16]);
586
587
0
    partialButterflyInverse16(src, coef, shift_1st, 16);
588
0
    partialButterflyInverse16(coef, block, shift_2nd, 16);
589
590
0
    for (int i = 0; i < 16; i++)
591
0
    {
592
0
        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
593
0
    }
594
0
}
595
596
void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
597
0
{
598
0
    const int shift_1st = 7;
599
0
    const int shift_2nd = 12 - (X265_DEPTH - 8);
600
601
0
    ALIGN_VAR_32(int16_t, coef[32 * 32]);
602
0
    ALIGN_VAR_32(int16_t, block[32 * 32]);
603
604
0
    partialButterflyInverse32(src, coef, shift_1st, 32);
605
0
    partialButterflyInverse32(coef, block, shift_2nd, 32);
606
607
0
    for (int i = 0; i < 32; i++)
608
0
    {
609
0
        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
610
0
    }
611
0
}
612
} // namespace X265_NS
613
614
static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
615
37.9k
{
616
#if HIGH_BIT_DEPTH
617
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
618
#else
619
    // NOTE: maximum of scale is (72 * 256)
620
37.9k
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
621
37.9k
#endif
622
37.9k
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
623
37.9k
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
624
37.9k
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
625
37.9k
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");
626
627
37.9k
    int add, coeffQ;
628
629
37.9k
    add = 1 << (shift - 1);
630
631
5.91M
    for (int n = 0; n < num; n++)
632
5.87M
    {
633
5.87M
        coeffQ = (quantCoef[n] * scale + add) >> shift;
634
5.87M
        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
635
5.87M
    }
636
37.9k
}
637
638
static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
639
0
{
640
0
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
641
642
0
    int add, coeffQ;
643
644
0
    shift += 4;
645
646
0
    if (shift > per)
647
0
    {
648
0
        add = 1 << (shift - per - 1);
649
650
0
        for (int n = 0; n < num; n++)
651
0
        {
652
0
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
653
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
654
0
        }
655
0
    }
656
0
    else
657
0
    {
658
0
        for (int n = 0; n < num; n++)
659
0
        {
660
0
            coeffQ   = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
661
0
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
662
0
        }
663
0
    }
664
0
}
665
666
static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
667
0
{
668
0
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
669
0
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
670
0
    int qBits8 = qBits - 8;
671
0
    uint32_t numSig = 0;
672
673
0
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
674
0
    {
675
0
        int level = coef[blockpos];
676
0
        int sign  = (level < 0 ? -1 : 1);
677
678
0
        int tmplevel = abs(level) * quantCoeff[blockpos];
679
0
        level = ((tmplevel + add) >> qBits);
680
0
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
681
0
        if (level)
682
0
            ++numSig;
683
0
        level *= sign;
684
0
        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
685
0
    }
686
687
0
    return numSig;
688
0
}
689
690
static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
691
8.61M
{
692
8.61M
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
693
8.61M
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
694
8.61M
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
695
696
8.61M
    uint32_t numSig = 0;
697
698
287M
    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
699
279M
    {
700
279M
        int level = coef[blockpos];
701
279M
        int sign  = (level < 0 ? -1 : 1);
702
703
279M
        int tmplevel = abs(level) * quantCoeff[blockpos];
704
279M
        level = ((tmplevel + add) >> qBits);
705
279M
        if (level)
706
112k
            ++numSig;
707
279M
        level *= sign;
708
709
        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
710
        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
711
279M
        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
712
279M
    }
713
714
8.61M
    return numSig;
715
8.61M
}
716
template<int trSize>
717
int  count_nonzero_c(const int16_t* quantCoeff)
718
59.1k
{
719
59.1k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
59.1k
    int count = 0;
721
59.1k
    int numCoeff = trSize * trSize;
722
9.50M
    for (int i = 0; i < numCoeff; i++)
723
9.44M
    {
724
9.44M
        count += quantCoeff[i] != 0;
725
9.44M
    }
726
727
59.1k
    return count;
728
59.1k
}
int count_nonzero_c<4>(short const*)
Line
Count
Source
718
25.2k
{
719
25.2k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
25.2k
    int count = 0;
721
25.2k
    int numCoeff = trSize * trSize;
722
428k
    for (int i = 0; i < numCoeff; i++)
723
403k
    {
724
403k
        count += quantCoeff[i] != 0;
725
403k
    }
726
727
25.2k
    return count;
728
25.2k
}
int count_nonzero_c<8>(short const*)
Line
Count
Source
718
15.4k
{
719
15.4k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
15.4k
    int count = 0;
721
15.4k
    int numCoeff = trSize * trSize;
722
1.00M
    for (int i = 0; i < numCoeff; i++)
723
989k
    {
724
989k
        count += quantCoeff[i] != 0;
725
989k
    }
726
727
15.4k
    return count;
728
15.4k
}
int count_nonzero_c<16>(short const*)
Line
Count
Source
718
14.1k
{
719
14.1k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
14.1k
    int count = 0;
721
14.1k
    int numCoeff = trSize * trSize;
722
3.63M
    for (int i = 0; i < numCoeff; i++)
723
3.61M
    {
724
3.61M
        count += quantCoeff[i] != 0;
725
3.61M
    }
726
727
14.1k
    return count;
728
14.1k
}
int count_nonzero_c<32>(short const*)
Line
Count
Source
718
4.33k
{
719
4.33k
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
720
4.33k
    int count = 0;
721
4.33k
    int numCoeff = trSize * trSize;
722
4.43M
    for (int i = 0; i < numCoeff; i++)
723
4.43M
    {
724
4.43M
        count += quantCoeff[i] != 0;
725
4.43M
    }
726
727
4.33k
    return count;
728
4.33k
}
729
730
/* Copy a trSize x trSize block of residuals into a densely packed coefficient
 * buffer and report how many of the copied values are non-zero.
 *
 *   coeff      - destination, written as trSize rows of trSize entries each
 *   residual   - source block; consecutive rows are resiStride entries apart
 *   resiStride - row pitch of residual, counted in int16_t elements
 *
 * Returns the number of non-zero entries that were copied. */
template<int trSize>
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
    uint32_t nonZeroCount = 0;

    for (int row = 0; row < trSize; row++)
    {
        int16_t* dstRow = coeff + row * trSize;
        const int16_t* srcRow = residual + row * resiStride;

        for (int col = 0; col < trSize; col++)
        {
            dstRow[col] = srcRow[col];
            if (srcRow[col] != 0)
                nonZeroCount++;
        }
    }

    return nonZeroCount;
}
unsigned int copy_count<4>(short*, short const*, long)
Line
Count
Source
732
2.09M
{
733
2.09M
    uint32_t numSig = 0;
734
10.4M
    for (int k = 0; k < trSize; k++)
735
8.37M
    {
736
41.8M
        for (int j = 0; j < trSize; j++)
737
33.4M
        {
738
33.4M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
33.4M
            numSig += (residual[k * resiStride + j] != 0);
740
33.4M
        }
741
8.37M
    }
742
743
2.09M
    return numSig;
744
2.09M
}
unsigned int copy_count<8>(short*, short const*, long)
Line
Count
Source
732
355k
{
733
355k
    uint32_t numSig = 0;
734
3.19M
    for (int k = 0; k < trSize; k++)
735
2.83M
    {
736
25.5M
        for (int j = 0; j < trSize; j++)
737
22.6M
        {
738
22.6M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
22.6M
            numSig += (residual[k * resiStride + j] != 0);
740
22.6M
        }
741
2.83M
    }
742
743
355k
    return numSig;
744
355k
}
unsigned int copy_count<16>(short*, short const*, long)
Line
Count
Source
732
79.1k
{
733
79.1k
    uint32_t numSig = 0;
734
1.34M
    for (int k = 0; k < trSize; k++)
735
1.26M
    {
736
21.4M
        for (int j = 0; j < trSize; j++)
737
20.2M
        {
738
20.2M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
20.2M
            numSig += (residual[k * resiStride + j] != 0);
740
20.2M
        }
741
1.26M
    }
742
743
79.1k
    return numSig;
744
79.1k
}
unsigned int copy_count<32>(short*, short const*, long)
Line
Count
Source
732
7.41k
{
733
7.41k
    uint32_t numSig = 0;
734
244k
    for (int k = 0; k < trSize; k++)
735
237k
    {
736
7.82M
        for (int j = 0; j < trSize; j++)
737
7.58M
        {
738
7.58M
            coeff[k * trSize + j] = residual[k * resiStride + j];
739
7.58M
            numSig += (residual[k * resiStride + j] != 0);
740
7.58M
        }
741
237k
    }
742
743
7.41k
    return numSig;
744
7.41k
}
745
746
/* Denoise a run of DCT coefficients in place.
 *
 * For each of the numCoeff entries:
 *   - the magnitude of dctCoef[i] is accumulated into resSum[i];
 *   - the magnitude is then reduced by offset[i] and clamped at zero;
 *   - the reduced magnitude is written back carrying the coefficient's
 *     original sign. */
static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
    for (int pos = 0; pos < numCoeff; pos++)
    {
        const int coef = dctCoef[pos];
        const int isNegative = coef < 0;
        int magnitude = isNegative ? -coef : coef;

        resSum[pos] += magnitude;

        /* shrink towards zero, never across it */
        magnitude -= offset[pos];
        if (magnitude < 0)
            magnitude = 0;

        dctCoef[pos] = (int16_t)(isNegative ? -magnitude : magnitude);
    }
}
758
759
static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
760
111k
{
761
111k
    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
762
111k
    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
763
111k
    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
764
765
111k
    int scanPosLast = 0;
766
111k
    do
767
1.60M
    {
768
1.60M
        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
769
770
1.60M
        const uint32_t posLast = scan[scanPosLast++];
771
772
1.60M
        const int curCoeff = coeff[posLast];
773
1.60M
        const uint32_t isNZCoeff = (curCoeff != 0);
774
        // get L1 sig map
775
        // NOTE: the new algorithm is complicated, so I keep reference code here
776
        //uint32_t posy   = posLast >> log2TrSize;
777
        //uint32_t posx   = posLast - (posy << log2TrSize);
778
        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
779
        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
780
        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
781
1.60M
        numSig -= isNZCoeff;
782
783
        // TODO: optimize by instruction BTS
784
1.60M
        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
785
1.60M
        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
786
1.60M
        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
787
1.60M
    }
788
1.60M
    while (numSig > 0);
789
111k
    return scanPosLast - 1;
790
111k
}
791
792
// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
793
static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
794
4.83k
{
795
4.83k
    int n;
796
797
17.7k
    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
798
17.7k
    {
799
17.7k
        const uint32_t idx = scanTbl[n];
800
17.7k
        const uint32_t idxY = idx / MLS_CG_SIZE;
801
17.7k
        const uint32_t idxX = idx % MLS_CG_SIZE;
802
17.7k
        if (dstCoeff[idxY * trSize + idxX])
803
4.83k
            break;
804
17.7k
    }
805
806
4.83k
    X265_CHECK(n >= -1, "non-zero coeff scan failuare!\n");
807
808
4.83k
    uint32_t lastNZPosInCG = (uint32_t)n;
809
810
4.83k
    for (n = 0; n < SCAN_SET_SIZE; n++)
811
4.83k
    {
812
4.83k
        const uint32_t idx = scanTbl[n];
813
4.83k
        const uint32_t idxY = idx / MLS_CG_SIZE;
814
4.83k
        const uint32_t idxX = idx % MLS_CG_SIZE;
815
4.83k
        if (dstCoeff[idxY * trSize + idxX])
816
4.83k
            break;
817
4.83k
    }
818
819
4.83k
    uint32_t firstNZPosInCG = (uint32_t)n;
820
821
4.83k
    uint32_t absSumSign = 0;
822
69.3k
    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
823
64.5k
    {
824
64.5k
        const uint32_t idx = scanTbl[n];
825
64.5k
        const uint32_t idxY = idx / MLS_CG_SIZE;
826
64.5k
        const uint32_t idxX = idx % MLS_CG_SIZE;
827
64.5k
        absSumSign += dstCoeff[idxY * trSize + idxX];
828
64.5k
    }
829
830
    // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
831
4.83k
    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
832
4.83k
}
833
834
835
static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
836
90.8k
{
837
90.8k
    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
838
90.8k
    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
839
90.8k
    uint32_t sum = 0;
840
841
    // correct offset to match assembly
842
90.8k
    absCoeff -= numNonZero;
843
844
454k
    for (int i = 0; i < MLS_CG_SIZE; i++)
845
363k
    {
846
363k
        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
847
363k
        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
848
363k
        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
849
363k
        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
850
363k
    }
851
852
90.8k
    do
853
1.42M
    {
854
1.42M
        uint32_t blkPos, sig, ctxSig;
855
1.42M
        blkPos = scan[scanPosSigOff];
856
1.42M
        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
857
1.42M
        sig     = scanFlagMask & 1;
858
1.42M
        scanFlagMask >>= 1;
859
1.42M
        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
860
1.42M
        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
861
1.42M
        {
862
1.42M
            const uint32_t cnt = tabSigCtx[blkPos] + offset;
863
1.42M
            ctxSig = cnt & posZeroMask;
864
865
            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
866
            //encodeBin(sig, baseCtx[ctxSig]);
867
1.42M
            const uint32_t mstate = baseCtx[ctxSig];
868
1.42M
            const uint32_t mps = mstate & 1;
869
1.42M
            const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
870
1.42M
            uint32_t nextState = (stateBits >> 24) + mps;
871
1.42M
            if ((mstate ^ sig) == 1)
872
11.5k
                nextState = sig;
873
1.42M
            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
874
1.42M
            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
875
1.42M
            baseCtx[ctxSig] = (uint8_t)nextState;
876
1.42M
            sum += stateBits;
877
1.42M
        }
878
1.42M
        assert(numNonZero <= 15);
879
1.42M
        assert(blkPos <= 15);
880
1.42M
        absCoeff[numNonZero] = tmpCoeff[blkPos];
881
1.42M
        numNonZero += sig;
882
1.42M
        scanPosSigOff--;
883
1.42M
    }
884
1.42M
    while(scanPosSigOff >= 0);
885
886
90.8k
    return (sum & 0xFFFFFF);
887
90.8k
}
888
889
static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
890
126k
{
891
126k
    uint32_t goRiceParam = 0;
892
893
126k
    uint32_t sum = 0;
894
126k
    int baseLevel = 3;
895
126k
    do
896
1.46M
    {
897
1.46M
        if (idx >= C1FLAG_NUMBER)
898
711k
            baseLevel = 1;
899
900
        // TODO: the IDX is not really idx, so this check inactive
901
        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
902
1.46M
        int codeNumber = absCoeff[idx] - baseLevel;
903
904
1.46M
        if (codeNumber >= 0)
905
1.46M
        {
906
            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
907
1.46M
            uint32_t length = 0;
908
909
1.46M
            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
910
1.46M
            if (codeNumber >= 0)
911
1.42M
            {
912
1.42M
                {
913
1.42M
                    unsigned long cidx;
914
1.42M
                    BSR(cidx, codeNumber + 1);
915
1.42M
                    length = cidx;
916
1.42M
                }
917
1.42M
                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
918
919
1.42M
                codeNumber = (length + length);
920
1.42M
            }
921
1.46M
            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
922
923
1.46M
            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
924
1.43M
                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
925
1.46M
            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
926
1.46M
        }
927
1.46M
        baseLevel = 2;
928
1.46M
        idx++;
929
1.46M
    }
930
1.46M
    while(idx < numNonZero);
931
932
126k
    return sum;
933
126k
}
934
935
936
static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
937
131k
{
938
131k
    uint32_t sum = 0;
939
131k
    uint32_t c1 = 1;
940
131k
    uint32_t firstC2Idx = 8;
941
131k
    uint32_t firstC2Flag = 2;
942
131k
    uint32_t c1Next = 0xFFFFFFFE;
943
944
131k
    int idx = 0;
945
131k
    do
946
767k
    {
947
767k
        uint32_t symbol1 = absCoeff[idx] > 1;
948
767k
        uint32_t symbol2 = absCoeff[idx] > 2;
949
        //encodeBin(symbol1, baseCtxMod[c1]);
950
767k
        {
951
767k
            const uint32_t mstate = baseCtxMod[c1];
952
767k
            baseCtxMod[c1] = sbacNext(mstate, symbol1);
953
767k
            sum += sbacGetEntropyBits(mstate, symbol1);
954
767k
        }
955
956
767k
        if (symbol1)
957
750k
            c1Next = 0;
958
959
767k
        if (symbol1 + firstC2Flag == 3)
960
126k
            firstC2Flag = symbol2;
961
962
767k
        if (symbol1 + firstC2Idx == 9)
963
126k
            firstC2Idx  = idx;
964
965
767k
        c1 = (c1Next & 3);
966
767k
        c1Next >>= 2;
967
767k
        X265_CHECK(c1 <= 3, "c1 check failure\n");
968
767k
        idx++;
969
767k
    }
970
767k
    while(idx < numC1Flag);
971
972
131k
    if (!c1)
973
126k
    {
974
126k
        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
975
976
126k
        baseCtxMod += ctxOffset;
977
978
        //encodeBin(firstC2Flag, baseCtxMod[0]);
979
126k
        {
980
126k
            const uint32_t mstate = baseCtxMod[0];
981
126k
            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
982
126k
            sum += sbacGetEntropyBits(mstate, firstC2Flag);
983
126k
        }
984
126k
    }
985
131k
    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
986
131k
}
987
template<int log2TrSize>
988
static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
989
114k
{
990
114k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
114k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
114k
    const uint32_t trSize = 1 << log2TrSize;
993
994
572k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
458k
    {
996
2.29M
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
1.83M
        {
998
1.83M
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
1.83M
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
1.83M
             *totalUncodedCost += costUncoded[blkPos + x];
1001
1.83M
             *totalRdCost += costUncoded[blkPos + x];
1002
1.83M
        }
1003
458k
        blkPos += trSize;
1004
458k
    }
1005
114k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void nonPsyRdoQuant_c<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
989
54.9k
{
990
54.9k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
54.9k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
54.9k
    const uint32_t trSize = 1 << log2TrSize;
993
994
274k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
219k
    {
996
1.09M
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
878k
        {
998
878k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
878k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
878k
             *totalUncodedCost += costUncoded[blkPos + x];
1001
878k
             *totalRdCost += costUncoded[blkPos + x];
1002
878k
        }
1003
219k
        blkPos += trSize;
1004
219k
    }
1005
54.9k
}
dct.cpp:void nonPsyRdoQuant_c<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
989
59.6k
{
990
59.6k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
991
59.6k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
992
59.6k
    const uint32_t trSize = 1 << log2TrSize;
993
994
298k
    for (int y = 0; y < MLS_CG_SIZE; y++)
995
238k
    {
996
1.19M
        for (int x = 0; x < MLS_CG_SIZE; x++)
997
954k
        {
998
954k
             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
999
954k
             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1000
954k
             *totalUncodedCost += costUncoded[blkPos + x];
1001
954k
             *totalRdCost += costUncoded[blkPos + x];
1002
954k
        }
1003
238k
        blkPos += trSize;
1004
238k
    }
1005
59.6k
}
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<5>(short*, long*, long*, long*, unsigned int)
1006
template<int log2TrSize>
1007
static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1008
0
{
1009
0
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1010
0
    const int scaleBits = SCALE_BITS - 2 * transformShift;
1011
0
    const uint32_t trSize = 1 << log2TrSize;
1012
0
    int max = X265_MAX(0, (2 * transformShift + 1));
1013
1014
0
    for (int y = 0; y < MLS_CG_SIZE; y++)
1015
0
    {
1016
0
        for (int x = 0; x < MLS_CG_SIZE; x++)
1017
0
        {
1018
0
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1019
0
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1020
1021
0
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1022
1023
            /* when no residual coefficient is coded, predicted coef == recon coef */
1024
0
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1025
1026
0
            *totalUncodedCost += costUncoded[blkPos + x];
1027
0
            *totalRdCost += costUncoded[blkPos + x];
1028
0
        }
1029
0
        blkPos += trSize;
1030
0
    }
1031
0
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1032
template<int log2TrSize>
1033
static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
1034
264k
{
1035
264k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
264k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
264k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
1.32M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
1.05M
  {
1041
5.29M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
4.23M
    {
1043
4.23M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
4.23M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
4.23M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
4.23M
      *totalRdCost += costUncoded[blkPos + x];
1047
4.23M
    }
1048
1.05M
    blkPos += trSize;
1049
1.05M
  }
1050
264k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<2>(short*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_1<3>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
15.2k
{
1035
15.2k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
15.2k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
15.2k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
76.2k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
61.0k
  {
1041
305k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
244k
    {
1043
244k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
244k
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
244k
      *totalUncodedCost += costUncoded[blkPos + x];
1046
244k
      *totalRdCost += costUncoded[blkPos + x];
1047
244k
    }
1048
61.0k
    blkPos += trSize;
1049
61.0k
  }
1050
15.2k
}
dct.cpp:void psyRdoQuant_c_1<4>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
92.6k
{
1035
92.6k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
92.6k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
92.6k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
463k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
370k
  {
1041
1.85M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
1.48M
    {
1043
1.48M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
1.48M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
1.48M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
1.48M
      *totalRdCost += costUncoded[blkPos + x];
1047
1.48M
    }
1048
370k
    blkPos += trSize;
1049
370k
  }
1050
92.6k
}
dct.cpp:void psyRdoQuant_c_1<5>(short*, long*, long*, long*, unsigned int)
Line
Count
Source
1034
156k
{
1035
156k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1036
156k
  const int scaleBits = SCALE_BITS - 2 * transformShift;
1037
156k
  const uint32_t trSize = 1 << log2TrSize;
1038
1039
784k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1040
627k
  {
1041
3.13M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1042
2.51M
    {
1043
2.51M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1044
2.51M
      costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1045
2.51M
      *totalUncodedCost += costUncoded[blkPos + x];
1046
2.51M
      *totalRdCost += costUncoded[blkPos + x];
1047
2.51M
    }
1048
627k
    blkPos += trSize;
1049
627k
  }
1050
156k
}
1051
template<int log2TrSize>
1052
static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1053
264k
{
1054
264k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
264k
  const uint32_t trSize = 1 << log2TrSize;
1057
264k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
1.32M
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
1.05M
  {
1061
5.29M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
4.23M
    {
1063
4.23M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
4.23M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
4.23M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
4.23M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
4.23M
      *totalRdCost += costUncoded[blkPos + x];
1068
4.23M
    }
1069
1.05M
    blkPos += trSize;
1070
1.05M
  }
1071
264k
}
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<2>(short*, short*, long*, long*, long*, long*, unsigned int)
dct.cpp:void psyRdoQuant_c_2<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
15.2k
{
1054
15.2k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
15.2k
  const uint32_t trSize = 1 << log2TrSize;
1057
15.2k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
76.2k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
61.0k
  {
1061
305k
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
244k
    {
1063
244k
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
244k
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
244k
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
244k
      *totalUncodedCost += costUncoded[blkPos + x];
1067
244k
      *totalRdCost += costUncoded[blkPos + x];
1068
244k
    }
1069
61.0k
    blkPos += trSize;
1070
61.0k
  }
1071
15.2k
}
dct.cpp:void psyRdoQuant_c_2<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
92.6k
{
1054
92.6k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
92.6k
  const uint32_t trSize = 1 << log2TrSize;
1057
92.6k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
463k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
370k
  {
1061
1.85M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
1.48M
    {
1063
1.48M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
1.48M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
1.48M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
1.48M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
1.48M
      *totalRdCost += costUncoded[blkPos + x];
1068
1.48M
    }
1069
370k
    blkPos += trSize;
1070
370k
  }
1071
92.6k
}
dct.cpp:void psyRdoQuant_c_2<5>(short*, short*, long*, long*, long*, long*, unsigned int)
Line
Count
Source
1053
156k
{
1054
156k
  const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1055
1056
156k
  const uint32_t trSize = 1 << log2TrSize;
1057
156k
  int max = X265_MAX(0, (2 * transformShift + 1));
1058
1059
784k
  for (int y = 0; y < MLS_CG_SIZE; y++)
1060
627k
  {
1061
3.13M
    for (int x = 0; x < MLS_CG_SIZE; x++)
1062
2.51M
    {
1063
2.51M
      int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
1064
2.51M
      int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1065
2.51M
      costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1066
2.51M
      *totalUncodedCost += costUncoded[blkPos + x];
1067
2.51M
      *totalRdCost += costUncoded[blkPos + x];
1068
2.51M
    }
1069
627k
    blkPos += trSize;
1070
627k
  }
1071
156k
}
1072
1073
namespace X265_NS {
1074
// x265 private namespace
1075
void setupDCTPrimitives_c(EncoderPrimitives& p)
1076
1
{
1077
1
    p.dequant_scaling = dequant_scaling_c;
1078
1
    p.dequant_normal = dequant_normal_c;
1079
1
    p.quant = quant_c;
1080
1
    p.nquant = nquant_c;
1081
1
    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
1082
1
    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
1083
1
    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
1084
1
    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
1085
1
    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
1086
1
    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
1087
1
    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
1088
1
    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
1089
1
    p.dst4x4 = dst4_c;
1090
1
    p.cu[BLOCK_4x4].dct   = dct4_c;
1091
1
    p.cu[BLOCK_8x8].dct   = dct8_c;
1092
1
    p.cu[BLOCK_16x16].dct = dct16_c;
1093
1
    p.cu[BLOCK_32x32].dct = dct32_c;
1094
1
    p.idst4x4 = idst4_c;
1095
1
    p.cu[BLOCK_4x4].idct   = idct4_c;
1096
1
    p.cu[BLOCK_8x8].idct   = idct8_c;
1097
1
    p.cu[BLOCK_16x16].idct = idct16_c;
1098
1
    p.cu[BLOCK_32x32].idct = idct32_c;
1099
1
    p.denoiseDct = denoiseDct_c;
1100
1
    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
1101
1
    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
1102
1
    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
1103
1
    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
1104
1105
1
    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
1106
1
    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
1107
1
    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
1108
1
    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
1109
1
  p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
1110
1
  p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
1111
1
  p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
1112
1
  p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
1113
1
  p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
1114
1
  p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
1115
1
  p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
1116
1
  p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
1117
1
    p.scanPosLast = scanPosLast_c;
1118
1
    p.findPosFirstLast = findPosFirstLast_c;
1119
1
    p.costCoeffNxN = costCoeffNxN_c;
1120
1
    p.costCoeffRemain = costCoeffRemain_c;
1121
1
    p.costC1C2Flag = costC1C2Flag_c;
1122
1
}
1123
}