/src/x265/source/common/dct.cpp
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Mandar Gurav <mandar@multicorewareinc.com> |
5 | | * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com> |
6 | | * Mahesh Pittala <mahesh@multicorewareinc.com> |
7 | | * Rajesh Paulraj <rajesh@multicorewareinc.com> |
8 | | * Min Chen <min.chen@multicorewareinc.com> |
9 | | * Praveen Kumar Tiwari <praveen@multicorewareinc.com> |
10 | | * Nabajit Deka <nabajit@multicorewareinc.com> |
11 | | * |
12 | | * This program is free software; you can redistribute it and/or modify |
13 | | * it under the terms of the GNU General Public License as published by |
14 | | * the Free Software Foundation; either version 2 of the License, or |
15 | | * (at your option) any later version. |
16 | | * |
17 | | * This program is distributed in the hope that it will be useful, |
18 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
20 | | * GNU General Public License for more details. |
21 | | * |
22 | | * You should have received a copy of the GNU General Public License |
23 | | * along with this program; if not, write to the Free Software |
24 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
25 | | * |
26 | | * This program is also available under a commercial proprietary license. |
27 | | * For more information, contact us at license @ x265.com. |
28 | | *****************************************************************************/ |
29 | | |
30 | | #include "common.h" |
31 | | #include "primitives.h" |
32 | | #include "contexts.h" // costCoeffNxN_c |
33 | | #include "threading.h" // CLZ |
34 | | |
35 | | using namespace X265_NS; |
36 | | |
37 | | #if _MSC_VER |
38 | | #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions |
39 | | #endif |
40 | | |
41 | | // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm |
42 | | // give identical results |
43 | | static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift) // input block, output coeff |
44 | 0 | { |
45 | 0 | int c[4]; |
46 | 0 | int rnd_factor = 1 << (shift - 1); |
47 | |
48 | 0 | for (int i = 0; i < 4; i++) |
49 | 0 | { |
50 | | // Intermediate Variables |
51 | 0 | c[0] = block[4 * i + 0] + block[4 * i + 3]; |
52 | 0 | c[1] = block[4 * i + 1] + block[4 * i + 3]; |
53 | 0 | c[2] = block[4 * i + 0] - block[4 * i + 1]; |
54 | 0 | c[3] = 74 * block[4 * i + 2]; |
55 | |
56 | 0 | coeff[i] = (int16_t)((29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); |
57 | 0 | coeff[4 + i] = (int16_t)((74 * (block[4 * i + 0] + block[4 * i + 1] - block[4 * i + 3]) + rnd_factor) >> shift); |
58 | 0 | coeff[8 + i] = (int16_t)((29 * c[2] + 55 * c[0] - c[3] + rnd_factor) >> shift); |
59 | 0 | coeff[12 + i] = (int16_t)((55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); |
60 | 0 | } |
61 | 0 | } |
62 | | |
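The comment above fastForwardDst claims that full matrix multiplication and the fast algorithm agree. A minimal standalone check of that claim, assuming the standard HEVC 4x4 DST-VII basis (the same 29/55/74/84 constants used above); g_dst4 and matrixForwardDst are illustrative names, not part of this file:

    #include <cstdint>

    // DST-VII basis; row k dotted with an input row gives coefficient row k.
    static const int g_dst4[4][4] = {
        { 29,  55,  74,  84 },
        { 74,  74,   0, -74 },
        { 84, -29, -74,  55 },
        { 55, -84,  74, -29 }
    };

    // Full matrix multiply with the transposed output layout of fastForwardDst:
    // coeff[4 * k + i] = (sum_j g_dst4[k][j] * block[4 * i + j] + rnd) >> shift
    static void matrixForwardDst(const int16_t* block, int16_t* coeff, int shift)
    {
        int rnd = 1 << (shift - 1);
        for (int i = 0; i < 4; i++)
            for (int k = 0; k < 4; k++)
            {
                int sum = 0;
                for (int j = 0; j < 4; j++)
                    sum += g_dst4[k][j] * block[4 * i + j];
                coeff[4 * k + i] = (int16_t)((sum + rnd) >> shift);
            }
    }

Expanding fastForwardDst's intermediates (c[0] = b0 + b3 and so on) reproduces exactly these four dot products per input row, rounding included, so the two agree bit-for-bit.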
63 | | static void inversedst(const int16_t* tmp, int16_t* block, int shift) // input tmp, output block |
64 | 0 | { |
65 | 0 | int i, c[4]; |
66 | 0 | int rnd_factor = 1 << (shift - 1); |
67 | |
68 | 0 | for (i = 0; i < 4; i++) |
69 | 0 | { |
70 | | // Intermediate Variables |
71 | 0 | c[0] = tmp[i] + tmp[8 + i]; |
72 | 0 | c[1] = tmp[8 + i] + tmp[12 + i]; |
73 | 0 | c[2] = tmp[i] - tmp[12 + i]; |
74 | 0 | c[3] = 74 * tmp[4 + i]; |
75 | |
76 | 0 | block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); |
77 | 0 | block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); |
78 | 0 | block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift); |
79 | 0 | block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift); |
80 | 0 | } |
81 | 0 | } |
82 | | |
83 | | static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line) |
84 | 0 | { |
85 | 0 | int j, k; |
86 | 0 | int E[8], O[8]; |
87 | 0 | int EE[4], EO[4]; |
88 | 0 | int EEE[2], EEO[2]; |
89 | 0 | int add = 1 << (shift - 1); |
90 | |
91 | 0 | for (j = 0; j < line; j++) |
92 | 0 | { |
93 | | /* E and O */ |
94 | 0 | for (k = 0; k < 8; k++) |
95 | 0 | { |
96 | 0 | E[k] = src[k] + src[15 - k]; |
97 | 0 | O[k] = src[k] - src[15 - k]; |
98 | 0 | } |
99 | | |
100 | | /* EE and EO */ |
101 | 0 | for (k = 0; k < 4; k++) |
102 | 0 | { |
103 | 0 | EE[k] = E[k] + E[7 - k]; |
104 | 0 | EO[k] = E[k] - E[7 - k]; |
105 | 0 | } |
106 | | |
107 | | /* EEE and EEO */ |
108 | 0 | EEE[0] = EE[0] + EE[3]; |
109 | 0 | EEO[0] = EE[0] - EE[3]; |
110 | 0 | EEE[1] = EE[1] + EE[2]; |
111 | 0 | EEO[1] = EE[1] - EE[2]; |
112 | |
113 | 0 | dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift); |
114 | 0 | dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift); |
115 | 0 | dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift); |
116 | 0 | dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift); |
117 | |
118 | 0 | for (k = 2; k < 16; k += 4) |
119 | 0 | { |
120 | 0 | dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] + |
121 | 0 | g_t16[k][3] * EO[3] + add) >> shift); |
122 | 0 | } |
123 | |
124 | 0 | for (k = 1; k < 16; k += 2) |
125 | 0 | { |
126 | 0 | dst[k * line] = (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] + |
127 | 0 | g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] + |
128 | 0 | add) >> shift); |
129 | 0 | } |
130 | |
131 | 0 | src += 16; |
132 | 0 | dst++; |
133 | 0 | } |
134 | 0 | } |
135 | | |
136 | | static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line) |
137 | 0 | { |
138 | 0 | int j, k; |
139 | 0 | int E[16], O[16]; |
140 | 0 | int EE[8], EO[8]; |
141 | 0 | int EEE[4], EEO[4]; |
142 | 0 | int EEEE[2], EEEO[2]; |
143 | 0 | int add = 1 << (shift - 1); |
144 | |
145 | 0 | for (j = 0; j < line; j++) |
146 | 0 | { |
147 | | /* E and O*/ |
148 | 0 | for (k = 0; k < 16; k++) |
149 | 0 | { |
150 | 0 | E[k] = src[k] + src[31 - k]; |
151 | 0 | O[k] = src[k] - src[31 - k]; |
152 | 0 | } |
153 | | |
154 | | /* EE and EO */ |
155 | 0 | for (k = 0; k < 8; k++) |
156 | 0 | { |
157 | 0 | EE[k] = E[k] + E[15 - k]; |
158 | 0 | EO[k] = E[k] - E[15 - k]; |
159 | 0 | } |
160 | | |
161 | | /* EEE and EEO */ |
162 | 0 | for (k = 0; k < 4; k++) |
163 | 0 | { |
164 | 0 | EEE[k] = EE[k] + EE[7 - k]; |
165 | 0 | EEO[k] = EE[k] - EE[7 - k]; |
166 | 0 | } |
167 | | |
168 | | /* EEEE and EEEO */ |
169 | 0 | EEEE[0] = EEE[0] + EEE[3]; |
170 | 0 | EEEO[0] = EEE[0] - EEE[3]; |
171 | 0 | EEEE[1] = EEE[1] + EEE[2]; |
172 | 0 | EEEO[1] = EEE[1] - EEE[2]; |
173 | |
174 | 0 | dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift); |
175 | 0 | dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift); |
176 | 0 | dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift); |
177 | 0 | dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift); |
178 | 0 | for (k = 4; k < 32; k += 8) |
179 | 0 | { |
180 | 0 | dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] + |
181 | 0 | g_t32[k][3] * EEO[3] + add) >> shift); |
182 | 0 | } |
183 | |
184 | 0 | for (k = 2; k < 32; k += 4) |
185 | 0 | { |
186 | 0 | dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] + |
187 | 0 | g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] + |
188 | 0 | g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift); |
189 | 0 | } |
190 | |
191 | 0 | for (k = 1; k < 32; k += 2) |
192 | 0 | { |
193 | 0 | dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] + |
194 | 0 | g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] + |
195 | 0 | g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] * |
196 | 0 | O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] + |
197 | 0 | g_t32[k][15] * O[15] + add) >> shift); |
198 | 0 | } |
199 | |
200 | 0 | src += 32; |
201 | 0 | dst++; |
202 | 0 | } |
203 | 0 | } |
204 | | |
205 | | static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line) |
206 | 0 | { |
207 | 0 | int j, k; |
208 | 0 | int E[4], O[4]; |
209 | 0 | int EE[2], EO[2]; |
210 | 0 | int add = 1 << (shift - 1); |
211 | |
212 | 0 | for (j = 0; j < line; j++) |
213 | 0 | { |
214 | | /* E and O*/ |
215 | 0 | for (k = 0; k < 4; k++) |
216 | 0 | { |
217 | 0 | E[k] = src[k] + src[7 - k]; |
218 | 0 | O[k] = src[k] - src[7 - k]; |
219 | 0 | } |
220 | | |
221 | | /* EE and EO */ |
222 | 0 | EE[0] = E[0] + E[3]; |
223 | 0 | EO[0] = E[0] - E[3]; |
224 | 0 | EE[1] = E[1] + E[2]; |
225 | 0 | EO[1] = E[1] - E[2]; |
226 | |
227 | 0 | dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift); |
228 | 0 | dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift); |
229 | 0 | dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift); |
230 | 0 | dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift); |
231 | |
232 | 0 | dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift); |
233 | 0 | dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift); |
234 | 0 | dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift); |
235 | 0 | dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift); |
236 | |
237 | 0 | src += 8; |
238 | 0 | dst++; |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line) |
243 | 0 | { |
244 | 0 | int j; |
245 | 0 | int E[2], O[2]; |
246 | 0 | int add = 1 << (shift - 1); |
247 | |
248 | 0 | for (j = 0; j < line; j++) |
249 | 0 | { |
250 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
251 | 0 | O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line]; |
252 | 0 | O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line]; |
253 | 0 | E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line]; |
254 | 0 | E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line]; |
255 | | |
256 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
257 | 0 | dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift)); |
258 | 0 | dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift)); |
259 | 0 | dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift)); |
260 | 0 | dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift)); |
261 | |
262 | 0 | src++; |
263 | 0 | dst += 4; |
264 | 0 | } |
265 | 0 | } |
266 | | |
267 | | static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line) |
268 | 0 | { |
269 | 0 | int j, k; |
270 | 0 | int E[4], O[4]; |
271 | 0 | int EE[2], EO[2]; |
272 | 0 | int add = 1 << (shift - 1); |
273 | |
274 | 0 | for (j = 0; j < line; j++) |
275 | 0 | { |
276 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
277 | 0 | for (k = 0; k < 4; k++) |
278 | 0 | { |
279 | 0 | O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line]; |
280 | 0 | } |
281 | |
282 | 0 | EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line]; |
283 | 0 | EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line]; |
284 | 0 | EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line]; |
285 | 0 | EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line]; |
286 | | |
287 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
288 | 0 | E[0] = EE[0] + EO[0]; |
289 | 0 | E[3] = EE[0] - EO[0]; |
290 | 0 | E[1] = EE[1] + EO[1]; |
291 | 0 | E[2] = EE[1] - EO[1]; |
292 | 0 | for (k = 0; k < 4; k++) |
293 | 0 | { |
294 | 0 | dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift); |
295 | 0 | dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift); |
296 | 0 | } |
297 | |
298 | 0 | src++; |
299 | 0 | dst += 8; |
300 | 0 | } |
301 | 0 | } |
302 | | |
303 | | static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line) |
304 | 0 | { |
305 | 0 | int j, k; |
306 | 0 | int E[8], O[8]; |
307 | 0 | int EE[4], EO[4]; |
308 | 0 | int EEE[2], EEO[2]; |
309 | 0 | int add = 1 << (shift - 1); |
310 | |
311 | 0 | for (j = 0; j < line; j++) |
312 | 0 | { |
313 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
314 | 0 | for (k = 0; k < 8; k++) |
315 | 0 | { |
316 | 0 | O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] + |
317 | 0 | g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line]; |
318 | 0 | } |
319 | |
320 | 0 | for (k = 0; k < 4; k++) |
321 | 0 | { |
322 | 0 | EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line]; |
323 | 0 | } |
324 | |
325 | 0 | EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line]; |
326 | 0 | EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line]; |
327 | 0 | EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line]; |
328 | 0 | EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line]; |
329 | | |
330 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
331 | 0 | for (k = 0; k < 2; k++) |
332 | 0 | { |
333 | 0 | EE[k] = EEE[k] + EEO[k]; |
334 | 0 | EE[k + 2] = EEE[1 - k] - EEO[1 - k]; |
335 | 0 | } |
336 | |
337 | 0 | for (k = 0; k < 4; k++) |
338 | 0 | { |
339 | 0 | E[k] = EE[k] + EO[k]; |
340 | 0 | E[k + 4] = EE[3 - k] - EO[3 - k]; |
341 | 0 | } |
342 | |
343 | 0 | for (k = 0; k < 8; k++) |
344 | 0 | { |
345 | 0 | dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift); |
346 | 0 | dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift); |
347 | 0 | } |
348 | |
349 | 0 | src++; |
350 | 0 | dst += 16; |
351 | 0 | } |
352 | 0 | } |
353 | | |
354 | | static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line) |
355 | 0 | { |
356 | 0 | int j, k; |
357 | 0 | int E[16], O[16]; |
358 | 0 | int EE[8], EO[8]; |
359 | 0 | int EEE[4], EEO[4]; |
360 | 0 | int EEEE[2], EEEO[2]; |
361 | 0 | int add = 1 << (shift - 1); |
362 | |
363 | 0 | for (j = 0; j < line; j++) |
364 | 0 | { |
365 | | /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ |
366 | 0 | for (k = 0; k < 16; k++) |
367 | 0 | { |
368 | 0 | O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] + |
369 | 0 | g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] + |
370 | 0 | g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] + |
371 | 0 | g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line]; |
372 | 0 | } |
373 | |
374 | 0 | for (k = 0; k < 8; k++) |
375 | 0 | { |
376 | 0 | EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] + |
377 | 0 | g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line]; |
378 | 0 | } |
379 | |
380 | 0 | for (k = 0; k < 4; k++) |
381 | 0 | { |
382 | 0 | EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line]; |
383 | 0 | } |
384 | |
385 | 0 | EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line]; |
386 | 0 | EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line]; |
387 | 0 | EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line]; |
388 | 0 | EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line]; |
389 | | |
390 | | /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ |
391 | 0 | EEE[0] = EEEE[0] + EEEO[0]; |
392 | 0 | EEE[3] = EEEE[0] - EEEO[0]; |
393 | 0 | EEE[1] = EEEE[1] + EEEO[1]; |
394 | 0 | EEE[2] = EEEE[1] - EEEO[1]; |
395 | 0 | for (k = 0; k < 4; k++) |
396 | 0 | { |
397 | 0 | EE[k] = EEE[k] + EEO[k]; |
398 | 0 | EE[k + 4] = EEE[3 - k] - EEO[3 - k]; |
399 | 0 | } |
400 | |
401 | 0 | for (k = 0; k < 8; k++) |
402 | 0 | { |
403 | 0 | E[k] = EE[k] + EO[k]; |
404 | 0 | E[k + 8] = EE[7 - k] - EO[7 - k]; |
405 | 0 | } |
406 | |
407 | 0 | for (k = 0; k < 16; k++) |
408 | 0 | { |
409 | 0 | dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift); |
410 | 0 | dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift); |
411 | 0 | } |
412 | |
413 | 0 | src++; |
414 | 0 | dst += 32; |
415 | 0 | } |
416 | 0 | } |
417 | | |
418 | | static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line) |
419 | 0 | { |
420 | 0 | int j; |
421 | 0 | int E[2], O[2]; |
422 | 0 | int add = 1 << (shift - 1); |
423 | |
424 | 0 | for (j = 0; j < line; j++) |
425 | 0 | { |
426 | | /* E and O */ |
427 | 0 | E[0] = src[0] + src[3]; |
428 | 0 | O[0] = src[0] - src[3]; |
429 | 0 | E[1] = src[1] + src[2]; |
430 | 0 | O[1] = src[1] - src[2]; |
431 | |
432 | 0 | dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift); |
433 | 0 | dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift); |
434 | 0 | dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift); |
435 | 0 | dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift); |
436 | |
437 | 0 | src += 4; |
438 | 0 | dst++; |
439 | 0 | } |
440 | 0 | } |
441 | | |
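Reading partialButterfly4 against the usual HEVC g_t4 constants (64, 83, 36; assumed here, since the table is defined elsewhere) shows why the even/odd split works: even basis rows are symmetric and odd rows antisymmetric, so the 16 multiplies of a full 4x4 matrix product collapse to 8. With s0..s3 the four input samples:

    dst[0]        = (64 * (s0 + s3) + 64 * (s1 + s2) + add) >> shift   // row { 64,  64,  64,  64 }
    dst[line]     = (83 * (s0 - s3) + 36 * (s1 - s2) + add) >> shift   // row { 83,  36, -36, -83 }
    dst[2 * line] = (64 * (s0 + s3) - 64 * (s1 + s2) + add) >> shift   // row { 64, -64, -64,  64 }
    dst[3 * line] = (36 * (s0 - s3) - 83 * (s1 - s2) + add) >> shift   // row { 36, -83,  83, -36 }

The larger butterflies earlier in this file iterate the same idea: each E/O level halves the work remaining for the even half.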
442 | | static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) |
443 | 0 | { |
444 | 0 | const int shift_1st = 1 + X265_DEPTH - 8; |
445 | 0 | const int shift_2nd = 8; |
446 | |
447 | 0 | ALIGN_VAR_32(int16_t, coef[4 * 4]); |
448 | 0 | ALIGN_VAR_32(int16_t, block[4 * 4]); |
449 | |
450 | 0 | for (int i = 0; i < 4; i++) |
451 | 0 | { |
452 | 0 | memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t)); |
453 | 0 | } |
454 | |
455 | 0 | fastForwardDst(block, coef, shift_1st); |
456 | 0 | fastForwardDst(coef, dst, shift_2nd); |
457 | 0 | } |
458 | | |
459 | | static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) |
460 | 0 | { |
461 | 0 | const int shift_1st = 1 + X265_DEPTH - 8; |
462 | 0 | const int shift_2nd = 8; |
463 | |
464 | 0 | ALIGN_VAR_32(int16_t, coef[4 * 4]); |
465 | 0 | ALIGN_VAR_32(int16_t, block[4 * 4]); |
466 | |
467 | 0 | for (int i = 0; i < 4; i++) |
468 | 0 | { |
469 | 0 | memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t)); |
470 | 0 | } |
471 | |
472 | 0 | partialButterfly4(block, coef, shift_1st, 4); |
473 | 0 | partialButterfly4(coef, dst, shift_2nd, 4); |
474 | 0 | } |
475 | | |
476 | | static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) |
477 | 0 | { |
478 | 0 | const int shift_1st = 2 + X265_DEPTH - 8; |
479 | 0 | const int shift_2nd = 9; |
480 | |
481 | 0 | ALIGN_VAR_32(int16_t, coef[8 * 8]); |
482 | 0 | ALIGN_VAR_32(int16_t, block[8 * 8]); |
483 | |
484 | 0 | for (int i = 0; i < 8; i++) |
485 | 0 | { |
486 | 0 | memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t)); |
487 | 0 | } |
488 | |
489 | 0 | partialButterfly8(block, coef, shift_1st, 8); |
490 | 0 | partialButterfly8(coef, dst, shift_2nd, 8); |
491 | 0 | } |
492 | | |
493 | | static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) |
494 | 0 | { |
495 | 0 | const int shift_1st = 3 + X265_DEPTH - 8; |
496 | 0 | const int shift_2nd = 10; |
497 | |
498 | 0 | ALIGN_VAR_32(int16_t, coef[16 * 16]); |
499 | 0 | ALIGN_VAR_32(int16_t, block[16 * 16]); |
500 | |
501 | 0 | for (int i = 0; i < 16; i++) |
502 | 0 | { |
503 | 0 | memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t)); |
504 | 0 | } |
505 | |
506 | 0 | partialButterfly16(block, coef, shift_1st, 16); |
507 | 0 | partialButterfly16(coef, dst, shift_2nd, 16); |
508 | 0 | } |
509 | | |
510 | | static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride) |
511 | 0 | { |
512 | 0 | const int shift_1st = 4 + X265_DEPTH - 8; |
513 | 0 | const int shift_2nd = 11; |
514 | |
515 | 0 | ALIGN_VAR_32(int16_t, coef[32 * 32]); |
516 | 0 | ALIGN_VAR_32(int16_t, block[32 * 32]); |
517 | |
518 | 0 | for (int i = 0; i < 32; i++) |
519 | 0 | { |
520 | 0 | memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t)); |
521 | 0 | } |
522 | |
523 | 0 | partialButterfly32(block, coef, shift_1st, 32); |
524 | 0 | partialButterfly32(coef, dst, shift_2nd, 32); |
525 | 0 | } |
526 | | |
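All five forward transforms share one two-pass pattern: copy the block, run the 1-D butterfly down the columns with shift_1st, then across the rows with shift_2nd. From the constants above, shift_1st = log2(N) - 1 + (X265_DEPTH - 8) and shift_2nd = log2(N) + 6, which keeps intermediates inside the 16-bit transform dynamic range. A hypothetical helper restating the rule:

    // Illustrative only: the shift rule implied by dct4_c .. dct32_c.
    inline void dctShifts(int log2N, int bitDepth, int& shift1, int& shift2)
    {
        shift1 = log2N - 1 + (bitDepth - 8); // e.g. 1 for 4x4 at 8-bit depth
        shift2 = log2N + 6;                  // e.g. 8 for 4x4
    }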
527 | | static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) |
528 | 0 | { |
529 | 0 | const int shift_1st = 7; |
530 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8); |
531 | |
532 | 0 | ALIGN_VAR_32(int16_t, coef[4 * 4]); |
533 | 0 | ALIGN_VAR_32(int16_t, block[4 * 4]); |
534 | |
535 | 0 | inversedst(src, coef, shift_1st); // inverse DST by fast algorithm, src input, coef output
536 | 0 | inversedst(coef, block, shift_2nd); // inverse DST by fast algorithm, coef input, block output
537 | |
538 | 0 | for (int i = 0; i < 4; i++) |
539 | 0 | { |
540 | 0 | memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t)); |
541 | 0 | } |
542 | 0 | } |
543 | | |
544 | | static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride) |
545 | 0 | { |
546 | 0 | const int shift_1st = 7; |
547 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8); |
548 | |
549 | 0 | ALIGN_VAR_32(int16_t, coef[4 * 4]); |
550 | 0 | ALIGN_VAR_32(int16_t, block[4 * 4]); |
551 | |
552 | 0 | partialButterflyInverse4(src, coef, shift_1st, 4); // inverse DCT pass 1, src input, coef output
553 | 0 | partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse DCT pass 2, coef input, block output
554 | |
555 | 0 | for (int i = 0; i < 4; i++) |
556 | 0 | { |
557 | 0 | memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t)); |
558 | 0 | } |
559 | 0 | } |
560 | | |
561 | | static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride) |
562 | 0 | { |
563 | 0 | const int shift_1st = 7; |
564 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8); |
565 | |
566 | 0 | ALIGN_VAR_32(int16_t, coef[8 * 8]); |
567 | 0 | ALIGN_VAR_32(int16_t, block[8 * 8]); |
568 | |
569 | 0 | partialButterflyInverse8(src, coef, shift_1st, 8); |
570 | 0 | partialButterflyInverse8(coef, block, shift_2nd, 8); |
571 | |
572 | 0 | for (int i = 0; i < 8; i++) |
573 | 0 | { |
574 | 0 | memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t)); |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride) |
579 | 0 | { |
580 | 0 | const int shift_1st = 7; |
581 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8); |
582 | |
583 | 0 | ALIGN_VAR_32(int16_t, coef[16 * 16]); |
584 | 0 | ALIGN_VAR_32(int16_t, block[16 * 16]); |
585 | |
586 | 0 | partialButterflyInverse16(src, coef, shift_1st, 16); |
587 | 0 | partialButterflyInverse16(coef, block, shift_2nd, 16); |
588 | |
589 | 0 | for (int i = 0; i < 16; i++) |
590 | 0 | { |
591 | 0 | memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t)); |
592 | 0 | } |
593 | 0 | } |
594 | | |
595 | | static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride) |
596 | 0 | { |
597 | 0 | const int shift_1st = 7; |
598 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8); |
599 | |
600 | 0 | ALIGN_VAR_32(int16_t, coef[32 * 32]); |
601 | 0 | ALIGN_VAR_32(int16_t, block[32 * 32]); |
602 | |
603 | 0 | partialButterflyInverse32(src, coef, shift_1st, 32); |
604 | 0 | partialButterflyInverse32(coef, block, shift_2nd, 32); |
605 | |
606 | 0 | for (int i = 0; i < 32; i++) |
607 | 0 | { |
608 | 0 | memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t)); |
609 | 0 | } |
610 | 0 | } |
611 | | |
612 | | static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) |
613 | 0 | { |
614 | | #if HIGH_BIT_DEPTH |
615 | | X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale); |
616 | | #else |
617 | | // NOTE: maximum of scale is (72 * 256) |
618 | 0 | X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale); |
619 | 0 | #endif |
620 | 0 | X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); |
621 | 0 | X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num); |
622 | 0 | X265_CHECK(shift <= 10, "shift too large %d\n", shift); |
623 | 0 | X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n"); |
624 | |
625 | 0 | int add, coeffQ; |
626 | |
627 | 0 | add = 1 << (shift - 1); |
628 | |
629 | 0 | for (int n = 0; n < num; n++) |
630 | 0 | { |
631 | 0 | coeffQ = (quantCoef[n] * scale + add) >> shift; |
632 | 0 | coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ); |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | | static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift) |
637 | 0 | { |
638 | 0 | X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); |
639 | |
640 | 0 | int add, coeffQ; |
641 | |
642 | 0 | shift += 4; |
643 | |
644 | 0 | if (shift > per) |
645 | 0 | { |
646 | 0 | add = 1 << (shift - per - 1); |
647 | |
648 | 0 | for (int n = 0; n < num; n++) |
649 | 0 | { |
650 | 0 | coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per); |
651 | 0 | coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ); |
652 | 0 | } |
653 | 0 | } |
654 | 0 | else |
655 | 0 | { |
656 | 0 | for (int n = 0; n < num; n++) |
657 | 0 | { |
658 | 0 | coeffQ = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]); |
659 | 0 | coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift)); |
660 | 0 | } |
661 | 0 | } |
662 | 0 | } |
663 | | |
664 | | static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff) |
665 | 0 | { |
666 | 0 | X265_CHECK(qBits >= 8, "qBits less than 8\n"); |
667 | 0 | X265_CHECK((numCoeff % 16) == 0, "numCoeff must be a multiple of 16\n");
668 | 0 | int qBits8 = qBits - 8; |
669 | 0 | uint32_t numSig = 0; |
670 | |
671 | 0 | for (int blockpos = 0; blockpos < numCoeff; blockpos++) |
672 | 0 | { |
673 | 0 | int level = coef[blockpos]; |
674 | 0 | int sign = (level < 0 ? -1 : 1); |
675 | |
676 | 0 | int tmplevel = abs(level) * quantCoeff[blockpos]; |
677 | 0 | level = ((tmplevel + add) >> qBits); |
678 | 0 | deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8); |
679 | 0 | if (level) |
680 | 0 | ++numSig; |
681 | 0 | level *= sign; |
682 | 0 | qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level); |
683 | 0 | } |
684 | |
685 | 0 | return numSig; |
686 | 0 | } |
687 | | |
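A worked trace of quant_c's arithmetic under assumed inputs (illustrative values, not from a real encode):

    // coef = -100, quantCoeff[blockpos] = 3000, qBits = 14, add = 1 << 13
    // tmplevel = 100 * 3000                        = 300000
    // level    = (300000 + 8192) >> 14             = 18
    // deltaU   = (300000 - (18 << 14)) >> (14 - 8) = 5088 >> 6 = 79
    // qCoef    = -18 (sign restored, clipped to [-32768, 32767])

deltaU keeps the rounding remainder at qBits - 8 precision; it is consumed downstream by the sign-bit-hiding decision.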
688 | | static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff) |
689 | 0 | { |
690 | 0 | X265_CHECK((numCoeff % 16) == 0, "number of quant coeffs is not a multiple of 16\n");
691 | 0 | X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "add must be less than 2^qBits\n");
692 | 0 | X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n"); |
693 | |
694 | 0 | uint32_t numSig = 0; |
695 | |
696 | 0 | for (int blockpos = 0; blockpos < numCoeff; blockpos++) |
697 | 0 | { |
698 | 0 | int level = coef[blockpos]; |
699 | 0 | int sign = (level < 0 ? -1 : 1); |
700 | |
701 | 0 | int tmplevel = abs(level) * quantCoeff[blockpos]; |
702 | 0 | level = ((tmplevel + add) >> qBits); |
703 | 0 | if (level) |
704 | 0 | ++numSig; |
705 | 0 | level *= sign; |
706 | | |
707 | | // TODO: limiting the range to [-32767, 32767] would allow a faster implementation, but it changes the output.
708 | | // Since nquant accounts for only a small share of rdoQuant time, the old dynamic range is kept for compatibility.
709 | 0 | qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level)); |
710 | 0 | } |
711 | |
712 | 0 | return numSig; |
713 | 0 | } |
714 | | template<int trSize> |
715 | | int count_nonzero_c(const int16_t* quantCoeff) |
716 | 0 | { |
717 | 0 | X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n"); |
718 | 0 | int count = 0; |
719 | 0 | int numCoeff = trSize * trSize; |
720 | 0 | for (int i = 0; i < numCoeff; i++) |
721 | 0 | { |
722 | 0 | count += quantCoeff[i] != 0; |
723 | 0 | } |
724 | |
725 | 0 | return count; |
726 | 0 | }
Unexecuted instantiation: int count_nonzero_c<4>(short const*)
Unexecuted instantiation: int count_nonzero_c<8>(short const*)
Unexecuted instantiation: int count_nonzero_c<16>(short const*)
Unexecuted instantiation: int count_nonzero_c<32>(short const*)
727 | | |
728 | | template<int trSize> |
729 | | uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride) |
730 | 0 | { |
731 | 0 | uint32_t numSig = 0; |
732 | 0 | for (int k = 0; k < trSize; k++) |
733 | 0 | { |
734 | 0 | for (int j = 0; j < trSize; j++) |
735 | 0 | { |
736 | 0 | coeff[k * trSize + j] = residual[k * resiStride + j]; |
737 | 0 | numSig += (residual[k * resiStride + j] != 0); |
738 | 0 | } |
739 | 0 | } |
740 | |
741 | 0 | return numSig; |
742 | 0 | }
Unexecuted instantiation: unsigned int copy_count<4>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<8>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<16>(short*, short const*, long)
Unexecuted instantiation: unsigned int copy_count<32>(short*, short const*, long)
743 | | |
744 | | static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff) |
745 | 0 | { |
746 | 0 | for (int i = 0; i < numCoeff; i++) |
747 | 0 | { |
748 | 0 | int level = dctCoef[i]; |
749 | 0 | int sign = level >> 31; |
750 | 0 | level = (level + sign) ^ sign; |
751 | 0 | resSum[i] += level; |
752 | 0 | level -= offset[i]; |
753 | 0 | dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign); |
754 | 0 | } |
755 | 0 | } |
756 | | |
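denoiseDct_c uses the classic branchless absolute-value identity (assuming 32-bit int and arithmetic right shift): sign = level >> 31 is 0 or -1, (level + sign) ^ sign is |level|, and the same XOR/subtract pair restores the sign after the offset is applied. The trick in isolation, with hypothetical names:

    // One denoise step on a single coefficient (illustrative sketch).
    inline int16_t denoiseOne(int16_t coef, uint16_t offset)
    {
        int level = coef;
        int sign = level >> 31;        // 0 if level >= 0, else -1
        level = (level + sign) ^ sign; // |level|
        level -= offset;               // subtract the denoise offset
        return (int16_t)(level < 0 ? 0 : (level ^ sign) - sign); // clamp at 0, re-sign
    }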
757 | | static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/) |
758 | 0 | { |
759 | 0 | memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum)); |
760 | 0 | memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag)); |
761 | 0 | memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign)); |
762 | |
763 | 0 | int scanPosLast = 0; |
764 | 0 | do |
765 | 0 | { |
766 | 0 | const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE; |
767 | |
768 | 0 | const uint32_t posLast = scan[scanPosLast++]; |
769 | |
770 | 0 | const int curCoeff = coeff[posLast]; |
771 | 0 | const uint32_t isNZCoeff = (curCoeff != 0); |
772 | | // get L1 sig map |
773 | | // NOTE: the new algorithm is complicated, so I keep reference code here |
774 | | //uint32_t posy = posLast >> log2TrSize; |
775 | | //uint32_t posx = posLast - (posy << log2TrSize); |
776 | | //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE); |
777 | | //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY); |
778 | | //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx); |
779 | 0 | numSig -= isNZCoeff; |
780 | | |
781 | | // TODO: optimize by instruction BTS |
782 | 0 | coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]); |
783 | 0 | coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; |
784 | 0 | coeffNum[cgIdx] += (uint8_t)isNZCoeff; |
785 | 0 | } |
786 | 0 | while (numSig > 0); |
787 | 0 | return scanPosLast - 1; |
788 | 0 | } |
789 | | |
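scanPosLast_c walks the scan order up to the last significant coefficient, filling three summaries per 16-coefficient coding group: coeffFlag packs significance bits (most recently scanned coefficient in the LSB), coeffSign packs sign bits indexed by the running nonzero count, and coeffNum counts nonzeros. An illustrative trace, assuming coefficients 5, 0, -3 in scan order:

    // after scan pos 0 (coeff  5): coeffFlag[0] = 0b1,   coeffSign[0] = 0b0,  coeffNum[0] = 1
    // after scan pos 1 (coeff  0): coeffFlag[0] = 0b10,  coeffSign[0] = 0b0,  coeffNum[0] = 1
    // after scan pos 2 (coeff -3): coeffFlag[0] = 0b101, coeffSign[0] = 0b10, coeffNum[0] = 2
    // returns 2, the scan position of the last significant coefficient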
790 | | // NOTE: lastNZPosInCG and absSumSign are undefined when the input block is all zeros
791 | | static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]) |
792 | 0 | { |
793 | 0 | int n; |
794 | |
795 | 0 | for (n = SCAN_SET_SIZE - 1; n >= 0; n--) |
796 | 0 | { |
797 | 0 | const uint32_t idx = scanTbl[n]; |
798 | 0 | const uint32_t idxY = idx / MLS_CG_SIZE; |
799 | 0 | const uint32_t idxX = idx % MLS_CG_SIZE; |
800 | 0 | if (dstCoeff[idxY * trSize + idxX]) |
801 | 0 | break; |
802 | 0 | } |
803 | |
804 | 0 | X265_CHECK(n >= -1, "non-zero coeff scan failure!\n");
805 | |
806 | 0 | uint32_t lastNZPosInCG = (uint32_t)n; |
807 | |
808 | 0 | for (n = 0; n < SCAN_SET_SIZE; n++) |
809 | 0 | { |
810 | 0 | const uint32_t idx = scanTbl[n]; |
811 | 0 | const uint32_t idxY = idx / MLS_CG_SIZE; |
812 | 0 | const uint32_t idxX = idx % MLS_CG_SIZE; |
813 | 0 | if (dstCoeff[idxY * trSize + idxX]) |
814 | 0 | break; |
815 | 0 | } |
816 | |
817 | 0 | uint32_t firstNZPosInCG = (uint32_t)n; |
818 | |
819 | 0 | uint32_t absSumSign = 0; |
820 | 0 | for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++) |
821 | 0 | { |
822 | 0 | const uint32_t idx = scanTbl[n]; |
823 | 0 | const uint32_t idxY = idx / MLS_CG_SIZE; |
824 | 0 | const uint32_t idxX = idx % MLS_CG_SIZE; |
825 | 0 | absSumSign += dstCoeff[idxY * trSize + idxX]; |
826 | 0 | } |
827 | | |
828 | | // NOTE: when the coeff block is all zeros, lastNZPosInCG is undefined and firstNZPosInCG is 16
829 | 0 | return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG); |
830 | 0 | } |
831 | | |
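findPosFirstLast_c packs three fields into its return value; only the low bit of the accumulated coefficient sum survives the << 31, which is exactly the parity that sign-bit hiding needs. A hypothetical unpacking helper, not part of this file:

    #include <cstdint>

    struct PosFirstLast { uint32_t firstNZ, lastNZ, sumParity; };

    inline PosFirstLast unpackPosFirstLast(uint32_t packed)
    {
        PosFirstLast r;
        r.firstNZ   = packed & 0xFF;        // bits 0..7:  first nonzero pos in CG
        r.lastNZ    = (packed >> 8) & 0xFF; // bits 8..15: last nonzero pos in CG
        r.sumParity = packed >> 31;         // bit 31: LSB of the coefficient sum
        return r;
    }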
832 | | |
833 | | static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase) |
834 | 0 | { |
835 | 0 | ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]); |
836 | 0 | uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0); |
837 | 0 | uint32_t sum = 0; |
838 | | |
839 | | // correct offset to match assembly |
840 | 0 | absCoeff -= numNonZero; |
841 | |
842 | 0 | for (int i = 0; i < MLS_CG_SIZE; i++) |
843 | 0 | { |
844 | 0 | tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]); |
845 | 0 | tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]); |
846 | 0 | tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]); |
847 | 0 | tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]); |
848 | 0 | } |
849 | |
850 | 0 | do |
851 | 0 | { |
852 | 0 | uint32_t blkPos, sig, ctxSig; |
853 | 0 | blkPos = scan[scanPosSigOff]; |
854 | 0 | const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0; |
855 | 0 | sig = scanFlagMask & 1; |
856 | 0 | scanFlagMask >>= 1; |
857 | 0 | X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sig flag mistake\n");
858 | 0 | if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero) |
859 | 0 | { |
860 | 0 | const uint32_t cnt = tabSigCtx[blkPos] + offset; |
861 | 0 | ctxSig = cnt & posZeroMask; |
862 | | |
863 | | //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");; |
864 | | //encodeBin(sig, baseCtx[ctxSig]); |
865 | 0 | const uint32_t mstate = baseCtx[ctxSig]; |
866 | 0 | const uint32_t mps = mstate & 1; |
867 | 0 | const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig]; |
868 | 0 | uint32_t nextState = (stateBits >> 24) + mps; |
869 | 0 | if ((mstate ^ sig) == 1) |
870 | 0 | nextState = sig; |
871 | 0 | X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n"); |
872 | 0 | X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n"); |
873 | 0 | baseCtx[ctxSig] = (uint8_t)nextState; |
874 | 0 | sum += stateBits; |
875 | 0 | } |
876 | 0 | assert(numNonZero <= 15); |
877 | 0 | assert(blkPos <= 15); |
878 | 0 | absCoeff[numNonZero] = tmpCoeff[blkPos]; |
879 | 0 | numNonZero += sig; |
880 | 0 | scanPosSigOff--; |
881 | 0 | } |
882 | 0 | while(scanPosSigOff >= 0); |
883 | |
884 | 0 | return (sum & 0xFFFFFF); |
885 | 0 | } |
886 | | |
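costCoeffNxN_c prices CABAC significance flags without actually encoding. Judging from the invariants it asserts against sbacNext() and sbacGetEntropyBits(), each PFX(entropyStateBits) entry packs two fields, which the loop splits like this:

    // entry = PFX(entropyStateBits)[mstate ^ sig];
    // entry & 0xFFFFFF : fractional bit cost, accumulated into 'sum'
    // entry >> 24      : state-transition step; nextState = (entry >> 24) + mps,
    //                    except when mstate ^ sig == 1, where nextState is 'sig'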
887 | | static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx) |
888 | 0 | { |
889 | 0 | uint32_t goRiceParam = 0; |
890 | |
891 | 0 | uint32_t sum = 0; |
892 | 0 | int baseLevel = 3; |
893 | 0 | do |
894 | 0 | { |
895 | 0 | if (idx >= C1FLAG_NUMBER) |
896 | 0 | baseLevel = 1; |
897 | | |
898 | | // TODO: the idx here is not the true scan index, so this check is disabled
899 | | //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failure\n");
900 | 0 | int codeNumber = absCoeff[idx] - baseLevel; |
901 | |
902 | 0 | if (codeNumber >= 0) |
903 | 0 | { |
904 | | //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam); |
905 | 0 | uint32_t length = 0; |
906 | |
907 | 0 | codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION; |
908 | 0 | if (codeNumber >= 0) |
909 | 0 | { |
910 | 0 | { |
911 | 0 | unsigned long cidx; |
912 | 0 | CLZ(cidx, codeNumber + 1); |
913 | 0 | length = cidx; |
914 | 0 | } |
915 | 0 | X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n"); |
916 | |
917 | 0 | codeNumber = (length + length); |
918 | 0 | } |
919 | 0 | sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber); |
920 | |
921 | 0 | if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam)) |
922 | 0 | goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); |
923 | 0 | X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n"); |
924 | 0 | } |
925 | 0 | baseLevel = 2; |
926 | 0 | idx++; |
927 | 0 | } |
928 | 0 | while(idx < numNonZero); |
929 | |
930 | 0 | return sum; |
931 | 0 | } |
932 | | |
933 | | |
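A worked trace of costCoeffRemain_c, simply following the code above with assumed inputs absCoeff[idx] = 10, goRiceParam = 0, idx < C1FLAG_NUMBER (so baseLevel = 3), and the usual HEVC value COEF_REMAIN_BIN_REDUCTION = 3:

    // codeNumber = 10 - 3 = 7
    // (7 >> 0) - 3 = 4 >= 0, so the Exp-Golomb escape path is taken
    // CLZ(4 + 1) gives length = 2, then codeNumber = 2 * length = 4
    // sum += 3 + 1 + 0 + 4 = 8 bins for this level
    // 10 > (3 << 0), so the Rice parameter adapts: (0 + 1) - (0 >> 2) = 1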
934 | | static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset) |
935 | 0 | { |
936 | 0 | uint32_t sum = 0; |
937 | 0 | uint32_t c1 = 1; |
938 | 0 | uint32_t firstC2Idx = 8; |
939 | 0 | uint32_t firstC2Flag = 2; |
940 | 0 | uint32_t c1Next = 0xFFFFFFFE; |
941 | |
942 | 0 | int idx = 0; |
943 | 0 | do |
944 | 0 | { |
945 | 0 | uint32_t symbol1 = absCoeff[idx] > 1; |
946 | 0 | uint32_t symbol2 = absCoeff[idx] > 2; |
947 | | //encodeBin(symbol1, baseCtxMod[c1]); |
948 | 0 | { |
949 | 0 | const uint32_t mstate = baseCtxMod[c1]; |
950 | 0 | baseCtxMod[c1] = sbacNext(mstate, symbol1); |
951 | 0 | sum += sbacGetEntropyBits(mstate, symbol1); |
952 | 0 | } |
953 | |
954 | 0 | if (symbol1) |
955 | 0 | c1Next = 0; |
956 | |
957 | 0 | if (symbol1 + firstC2Flag == 3) |
958 | 0 | firstC2Flag = symbol2; |
959 | |
960 | 0 | if (symbol1 + firstC2Idx == 9) |
961 | 0 | firstC2Idx = idx; |
962 | |
963 | 0 | c1 = (c1Next & 3); |
964 | 0 | c1Next >>= 2; |
965 | 0 | X265_CHECK(c1 <= 3, "c1 check failure\n"); |
966 | 0 | idx++; |
967 | 0 | } |
968 | 0 | while(idx < numC1Flag); |
969 | |
970 | 0 | if (!c1) |
971 | 0 | { |
972 | 0 | X265_CHECK((firstC2Flag <= 1), "firstC2Flag check failure\n");
973 | |
974 | 0 | baseCtxMod += ctxOffset; |
975 | | |
976 | | //encodeBin(firstC2Flag, baseCtxMod[0]); |
977 | 0 | { |
978 | 0 | const uint32_t mstate = baseCtxMod[0]; |
979 | 0 | baseCtxMod[0] = sbacNext(mstate, firstC2Flag); |
980 | 0 | sum += sbacGetEntropyBits(mstate, firstC2Flag); |
981 | 0 | } |
982 | 0 | } |
983 | 0 | return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28); |
984 | 0 | } |
985 | | template<int log2TrSize> |
986 | | static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) |
987 | 0 | { |
988 | 0 | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
989 | 0 | const int scaleBits = SCALE_BITS - 2 * transformShift; |
990 | 0 | const uint32_t trSize = 1 << log2TrSize; |
991 | |
992 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
993 | 0 | { |
994 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
995 | 0 | { |
996 | 0 | int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ |
997 | 0 | costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); |
998 | 0 | *totalUncodedCost += costUncoded[blkPos + x]; |
999 | 0 | *totalRdCost += costUncoded[blkPos + x]; |
1000 | 0 | } |
1001 | 0 | blkPos += trSize; |
1002 | 0 | } |
1003 | 0 | }
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<2>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<3>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<4>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void nonPsyRdoQuant_c<5>(short*, long*, long*, long*, unsigned int)
1004 | | template<int log2TrSize> |
1005 | | static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) |
1006 | 0 | { |
1007 | 0 | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
1008 | 0 | const int scaleBits = SCALE_BITS - 2 * transformShift; |
1009 | 0 | const uint32_t trSize = 1 << log2TrSize; |
1010 | 0 | int max = X265_MAX(0, (2 * transformShift + 1)); |
1011 | |
1012 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
1013 | 0 | { |
1014 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
1015 | 0 | { |
1016 | 0 | int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ |
1017 | 0 | int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ |
1018 | |
1019 | 0 | costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); |
1020 | | |
1021 | | /* when no residual coefficient is coded, predicted coef == recon coef */ |
1022 | 0 | costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max)); |
1023 | |
1024 | 0 | *totalUncodedCost += costUncoded[blkPos + x]; |
1025 | 0 | *totalRdCost += costUncoded[blkPos + x]; |
1026 | 0 | } |
1027 | 0 | blkPos += trSize; |
1028 | 0 | } |
1029 | 0 | }
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c<5>(short*, short*, long*, long*, long*, long*, unsigned int)
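Both RDO-quant variants price the option of coding a coefficient as zero. Restating the assignments above as formulas, nonPsyRdoQuant_c charges the residual energy mapped back to pixel-domain scale, and psyRdoQuant_c additionally credits energy the prediction already carries:

    costUncoded[i]  = resiDct[i]^2 << (SCALE_BITS - 2 * transformShift)
    costUncoded[i] -= (psyScale * (fencDct[i] - resiDct[i])) >> (2 * transformShift + 1)   // psy variant only

psyRdoQuant_c_1 below computes only the first line and psyRdoQuant_c_2 only the second, letting callers split the work into two passes.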
1030 | | template<int log2TrSize> |
1031 | | static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos) |
1032 | 0 | { |
1033 | 0 | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
1034 | 0 | const int scaleBits = SCALE_BITS - 2 * transformShift; |
1035 | 0 | const uint32_t trSize = 1 << log2TrSize; |
1036 | |
1037 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
1038 | 0 | { |
1039 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
1040 | 0 | { |
1041 | 0 | int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ |
1042 | 0 | costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); |
1043 | 0 | *totalUncodedCost += costUncoded[blkPos + x]; |
1044 | 0 | *totalRdCost += costUncoded[blkPos + x]; |
1045 | 0 | } |
1046 | 0 | blkPos += trSize; |
1047 | 0 | } |
1048 | 0 | }
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<2>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<3>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<4>(short*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_1<5>(short*, long*, long*, long*, unsigned int)
1049 | | template<int log2TrSize> |
1050 | | static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) |
1051 | 0 | { |
1052 | 0 | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
1053 | |
1054 | 0 | const uint32_t trSize = 1 << log2TrSize; |
1055 | 0 | int max = X265_MAX(0, (2 * transformShift + 1)); |
1056 | |
1057 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
1058 | 0 | { |
1059 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
1060 | 0 | { |
1061 | 0 | int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ |
1062 | 0 | int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ |
1063 | 0 | costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max)); |
1064 | 0 | *totalUncodedCost += costUncoded[blkPos + x]; |
1065 | 0 | *totalRdCost += costUncoded[blkPos + x]; |
1066 | 0 | } |
1067 | 0 | blkPos += trSize; |
1068 | 0 | } |
1069 | 0 | }
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<2>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<3>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<4>(short*, short*, long*, long*, long*, long*, unsigned int)
Unexecuted instantiation: dct.cpp:void psyRdoQuant_c_2<5>(short*, short*, long*, long*, long*, long*, unsigned int)
1070 | | |
1071 | | namespace X265_NS { |
1072 | | // x265 private namespace |
1073 | | void setupDCTPrimitives_c(EncoderPrimitives& p) |
1074 | 0 | { |
1075 | 0 | p.dequant_scaling = dequant_scaling_c; |
1076 | 0 | p.dequant_normal = dequant_normal_c; |
1077 | 0 | p.quant = quant_c; |
1078 | 0 | p.nquant = nquant_c; |
1079 | 0 | p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>; |
1080 | 0 | p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>; |
1081 | 0 | p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>; |
1082 | 0 | p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>; |
1083 | 0 | p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>; |
1084 | 0 | p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>; |
1085 | 0 | p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>; |
1086 | 0 | p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>; |
1087 | 0 | p.dst4x4 = dst4_c; |
1088 | 0 | p.cu[BLOCK_4x4].dct = dct4_c; |
1089 | 0 | p.cu[BLOCK_8x8].dct = dct8_c; |
1090 | 0 | p.cu[BLOCK_16x16].dct = dct16_c; |
1091 | 0 | p.cu[BLOCK_32x32].dct = dct32_c; |
1092 | 0 | p.idst4x4 = idst4_c; |
1093 | 0 | p.cu[BLOCK_4x4].idct = idct4_c; |
1094 | 0 | p.cu[BLOCK_8x8].idct = idct8_c; |
1095 | 0 | p.cu[BLOCK_16x16].idct = idct16_c; |
1096 | 0 | p.cu[BLOCK_32x32].idct = idct32_c; |
1097 | 0 | p.denoiseDct = denoiseDct_c; |
1098 | 0 | p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>; |
1099 | 0 | p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>; |
1100 | 0 | p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>; |
1101 | 0 | p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>; |
1102 | |
1103 | 0 | p.cu[BLOCK_4x4].copy_cnt = copy_count<4>; |
1104 | 0 | p.cu[BLOCK_8x8].copy_cnt = copy_count<8>; |
1105 | 0 | p.cu[BLOCK_16x16].copy_cnt = copy_count<16>; |
1106 | 0 | p.cu[BLOCK_32x32].copy_cnt = copy_count<32>; |
1107 | 0 | p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>; |
1108 | 0 | p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>; |
1109 | 0 | p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>; |
1110 | 0 | p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>; |
1111 | 0 | p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>; |
1112 | 0 | p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>; |
1113 | 0 | p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>; |
1114 | 0 | p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>; |
1115 | 0 | p.scanPosLast = scanPosLast_c; |
1116 | 0 | p.findPosFirstLast = findPosFirstLast_c; |
1117 | 0 | p.costCoeffNxN = costCoeffNxN_c; |
1118 | 0 | p.costCoeffRemain = costCoeffRemain_c; |
1119 | 0 | p.costC1C2Flag = costC1C2Flag_c; |
1120 | 0 | } |
1121 | | } |
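setupDCTPrimitives_c installs these C reference kernels into the EncoderPrimitives dispatch table; optimized setup (e.g. SIMD) can later overwrite individual entries. A hedged usage sketch, with the real initialization flow simplified to just this file's contribution:

    // Illustrative only: real setup happens inside x265's primitive
    // initialization, and transform buffers are kept 32-byte aligned.
    static void exampleUse()
    {
        EncoderPrimitives prims;
        memset(&prims, 0, sizeof(prims));
        setupDCTPrimitives_c(prims);

        ALIGN_VAR_32(int16_t, residual[4 * 4]);
        ALIGN_VAR_32(int16_t, coeffs[4 * 4]);
        memset(residual, 0, sizeof(residual));
        prims.cu[BLOCK_4x4].dct(residual, coeffs, 4); // forward 4x4 DCT, srcStride = 4
    }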