/work/libde265/libde265/fallback-dct.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "fallback-dct.h" |
22 | | |
23 | | #if defined(_MSC_VER) || defined(__MINGW32__) |
24 | | # include <malloc.h> |
25 | | #elif defined(HAVE_ALLOCA_H) |
26 | | # include <alloca.h> |
27 | | #endif |
28 | | |
29 | | #include <assert.h> |
30 | | #include <algorithm> |
31 | | |
32 | | |
// Debug helper: prints the n x n matrix 'v' (row-major) to stdout,
// preceded by a header line that contains 'name'.
static void printMatrix(const char* name, const int16_t* v, int n)
{
  printf("--- %s ---\n", name);

  for (int row = 0; row < n; row++) {
    const int rowStart = row * n;

    for (int col = 0; col < n; col++) {
      printf("%4d ", v[rowStart + col]);
    }

    printf("\n");
  }
}
43 | | |
44 | | |
45 | | |
46 | | void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
47 | 0 | { |
48 | 0 | int nT = 4; |
49 | 0 | int bdShift2 = 20-8; |
50 | |
|
51 | 0 | assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size |
52 | | |
53 | 0 | for (int y=0;y<nT;y++) |
54 | 0 | for (int x=0;x<nT;x++) { |
55 | 0 | int32_t c = coeffs[x+y*nT] << 7; |
56 | 0 | c = (c+(1<<(bdShift2-1)))>>bdShift2; |
57 | |
|
58 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c); |
59 | 0 | } |
60 | 0 | } |
61 | | |
62 | | |
63 | | void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) |
64 | 0 | { |
65 | 0 | int nT = 4; |
66 | 0 | int bdShift2 = 20-bit_depth; |
67 | |
|
68 | 0 | assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size |
69 | | |
70 | 0 | for (int y=0;y<nT;y++) |
71 | 0 | for (int x=0;x<nT;x++) { |
72 | 0 | int32_t c = coeffs[x+y*nT] << 7; |
73 | 0 | c = (c+(1<<(bdShift2-1)))>>bdShift2; |
74 | |
|
75 | 0 | dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth); |
76 | 0 | } |
77 | 0 | } |
78 | | |
79 | | |
/* Transform-skip: scales every coefficient of an nT x nT block and stores
   the rounded result as residual.

   residual[i] = ((coeffs[i] << tsShift) + (1 << (bdShift-1))) >> bdShift

   Both arrays are row-major with a row length of nT.
*/
void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT,
                                      int tsShift,int bdShift)
{
  const int roundingOffset = 1 << (bdShift-1);

  for (int row = 0; row < nT; row++) {
    const int rowStart = row * nT;

    for (int col = 0; col < nT; col++) {
      const int idx = rowStart + col;

      // C++ up to C++17 treats left-shift of negative signed values as UB,
      // hence the shift is carried out on the unsigned representation.
      const uint32_t widened = (uint32_t)coeffs[idx];
      const int32_t  scaled  = (int32_t)(widened << tsShift);

      residual[idx] = (scaled + roundingOffset) >> bdShift;
    }
  }
}
91 | | |
92 | | |
93 | | void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) |
94 | 0 | { |
95 | 0 | int bitDepth = 8; |
96 | 0 | int bdShift2 = 20-bitDepth; |
97 | 0 | int offset = (1<<(bdShift2-1)); |
98 | 0 | int tsShift = 5 + log2nT; // TODO: extended_precision |
99 | 0 | int nT = 1<<log2nT; |
100 | |
|
101 | 0 | for (int x=0;x<nT;x++) { |
102 | 0 | int32_t sum = 0; |
103 | |
|
104 | 0 | for (int y=0;y<nT;y++) { |
105 | 0 | int c = coeffs[x+y*nT] << tsShift; |
106 | 0 | sum += (c+offset)>>bdShift2; |
107 | |
|
108 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); |
109 | 0 | } |
110 | 0 | } |
111 | 0 | } |
112 | | |
113 | | void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) |
114 | 0 | { |
115 | 0 | int bitDepth = 8; |
116 | 0 | int bdShift2 = 20-bitDepth; |
117 | 0 | int offset = (1<<(bdShift2-1)); |
118 | 0 | int tsShift = 5 + log2nT; // TODO: extended_precision |
119 | 0 | int nT = 1<<log2nT; |
120 | |
|
121 | 0 | for (int y=0;y<nT;y++) { |
122 | 0 | int32_t sum = 0; |
123 | |
|
124 | 0 | for (int x=0;x<nT;x++) { |
125 | 0 | int c = coeffs[x+y*nT] << tsShift; |
126 | 0 | sum += (c+offset)>>bdShift2; |
127 | |
|
128 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); |
129 | 0 | } |
130 | 0 | } |
131 | 0 | } |
132 | | |
133 | | |
134 | | void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride) |
135 | 0 | { |
136 | 0 | for (int x=0;x<nT;x++) { |
137 | 0 | int32_t sum=0; |
138 | 0 | for (int y=0;y<nT;y++) { |
139 | 0 | sum += coeffs[x+y*nT]; |
140 | |
|
141 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); |
142 | 0 | } |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | | |
147 | | void transform_bypass_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride) |
148 | 0 | { |
149 | 0 | for (int y=0;y<nT;y++) { |
150 | 0 | int32_t sum=0; |
151 | 0 | for (int x=0;x<nT;x++) { |
152 | 0 | sum += coeffs[x+y*nT]; |
153 | |
|
154 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); |
155 | 0 | } |
156 | 0 | } |
157 | 0 | } |
158 | | |
159 | | |
/* Transform-bypass with vertical RDPCM: writes to dst the running sum of
   the coefficients down each column.

   dst and coeffs are both nT x nT, row-major.
*/
void transform_bypass_rdpcm_v_fallback(int32_t *dst, const int16_t *coeffs,int nT)
{
  for (int col = 0; col < nT; col++) {
    int32_t acc = 0;

    for (int row = 0; row < nT; row++) {
      const int idx = row*nT + col;

      acc += coeffs[idx];
      dst[idx] = acc;
    }
  }
}
171 | | |
172 | | |
/* Transform-bypass with horizontal RDPCM: writes to dst the running sum of
   the coefficients along each row.

   dst and coeffs are both nT x nT, row-major.
*/
void transform_bypass_rdpcm_h_fallback(int32_t *dst, const int16_t *coeffs,int nT)
{
  for (int row = 0; row < nT; row++) {
    const int rowStart = row*nT;
    int32_t acc = 0;

    for (int col = 0; col < nT; col++) {
      acc += coeffs[rowStart + col];
      dst[rowStart + col] = acc;
    }
  }
}
184 | | |
185 | | |
/* Vertical RDPCM on scaled coefficients.

   Every coefficient is scaled as
     ((coeffs[i] << tsShift) + (1 << (bdShift-1))) >> bdShift
   and the scaled values are accumulated down each column; the running sum
   is stored in residual.

   residual and coeffs are both nT x nT, row-major.
*/
void rdpcm_v_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
{
  const int rnd = (1<<(bdShift-1));

  for (int x=0;x<nT;x++) {
    int sum=0;
    for (int y=0;y<nT;y++) {
      // Shift in the unsigned domain: left-shifting a negative signed value
      // is undefined behaviour in C++ up to C++17.
      int c = (int32_t)((uint32_t)coeffs[x+y*nT] << tsShift);
      sum += (c+rnd)>>bdShift;
      residual[y*nT+x] = sum;
    }
  }
}
199 | | |
200 | | |
/* Horizontal RDPCM on scaled coefficients.

   Every coefficient is scaled as
     ((coeffs[i] << tsShift) + (1 << (bdShift-1))) >> bdShift
   and the scaled values are accumulated along each row; the running sum
   is stored in residual.

   residual and coeffs are both nT x nT, row-major.
*/
void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
{
  const int rnd = (1<<(bdShift-1));

  for (int y=0;y<nT;y++) {
    int sum=0;
    for (int x=0;x<nT;x++) {
      // Shift in the unsigned domain: left-shifting a negative signed value
      // is undefined behaviour in C++ up to C++17.
      int c = (int32_t)((uint32_t)coeffs[x+y*nT] << tsShift);
      sum += (c+rnd)>>bdShift;
      residual[y*nT+x] = sum;
    }
  }
}
214 | | |
215 | | |
/* Transform-bypass: copies the nT x nT coefficients unchanged into dst,
   widening each value from 16 to 32 bits.
*/
void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT)
{
  for (int row = 0; row < nT; row++) {
    const int rowStart = row*nT;

    for (int col = 0; col < nT; col++) {
      dst[rowStart + col] = (int32_t)coeffs[rowStart + col];
    }
  }
}
225 | | |
226 | | |
227 | | void transform_bypass_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride) |
228 | 0 | { |
229 | 0 | for (int y=0;y<nT;y++) |
230 | 0 | for (int x=0;x<nT;x++) { |
231 | 0 | int32_t c = coeffs[x+y*nT]; |
232 | |
|
233 | 0 | dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c); |
234 | 0 | } |
235 | 0 | } |
236 | | |
237 | | |
238 | | void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) |
239 | 0 | { |
240 | 0 | int bdShift2 = 20-bit_depth; |
241 | |
|
242 | 0 | for (int y=0;y<nT;y++) |
243 | 0 | for (int x=0;x<nT;x++) { |
244 | 0 | int32_t c = coeffs[x+y*nT]; |
245 | |
|
246 | 0 | dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth); |
247 | 0 | } |
248 | 0 | } |
249 | | |
250 | | |
/* Rotates an nT x nT coefficient block by 180 degrees in place: every
   element of the upper nT/2 rows is exchanged with its point-mirrored
   counterpart in the lower rows.
*/
void rotate_coefficients_fallback(int16_t *coeff, int nT)
{
  const int lastIdx  = nT*nT - 1;
  const int halfRows = nT/2;

  for (int row = 0; row < halfRows; row++) {
    for (int col = 0; col < nT; col++) {
      const int front = row*nT + col;

      // (nT-1-row)*nT + (nT-1-col) == nT*nT-1 - (row*nT+col)
      std::swap(coeff[front], coeff[lastIdx - front]);
    }
  }
}
258 | | |
259 | | |
260 | | |
// 4x4 transform matrix used by the 4x4 luma DST routines in this file.
// The forward transform indexes it as [i][j], the inverse as [j][i].
// Read-only lookup table, hence const.
static const int8_t mat_8_357[4][4] = {
  { 29, 55, 74, 84 },
  { 74, 74,  0,-74 },
  { 84,-29,-74, 55 },
  { 55,-84, 74,-29 }
};
267 | | |
268 | | |
269 | | |
270 | | void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
271 | 0 | { |
272 | 0 | int16_t g[4][4]; |
273 | |
|
274 | 0 | int postShift = 20-8; // 8 bit |
275 | 0 | int rndV = 1<<(7-1); |
276 | 0 | int rndH = 1<<(postShift-1); |
277 | | |
278 | | |
279 | | // --- V --- |
280 | |
|
281 | 0 | for (int c=0;c<4;c++) { |
282 | | /* |
283 | | logtrace(LogTransform,"DST-V: "); |
284 | | for (int r=0;r<4;r++) { |
285 | | logtrace(LogTransform,"%d ",coeffs[c+r*4]); |
286 | | } |
287 | | logtrace(LogTransform,"* -> "); |
288 | | */ |
289 | |
|
290 | 0 | for (int i=0;i<4;i++) { |
291 | 0 | int sum=0; |
292 | |
|
293 | 0 | for (int j=0;j<4;j++) { |
294 | 0 | sum += mat_8_357[j][i] * coeffs[c+j*4]; |
295 | 0 | } |
296 | |
|
297 | 0 | g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); |
298 | 0 | } |
299 | | |
300 | | /* |
301 | | for (int y=0;y<4;y++) { |
302 | | logtrace(LogTransform,"*%d ",g[y][c]); |
303 | | } |
304 | | logtrace(LogTransform,"*\n"); |
305 | | */ |
306 | 0 | } |
307 | | |
308 | | |
309 | | // --- H --- |
310 | |
|
311 | 0 | for (int y=0;y<4;y++) { |
312 | | |
313 | | /* |
314 | | logtrace(LogTransform,"DST-H: "); |
315 | | for (int c=0;c<4;c++) { |
316 | | logtrace(LogTransform,"%d ",g[y][c]); |
317 | | } |
318 | | logtrace(LogTransform,"* -> "); |
319 | | */ |
320 | |
|
321 | 0 | for (int i=0;i<4;i++) { |
322 | 0 | int sum=0; |
323 | |
|
324 | 0 | for (int j=0;j<4;j++) { |
325 | 0 | sum += mat_8_357[j][i] * g[y][j]; |
326 | 0 | } |
327 | |
|
328 | 0 | int out = Clip3(-32768,32767, (sum+rndH)>>postShift); |
329 | |
|
330 | 0 | dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out); |
331 | |
|
332 | 0 | logtrace(LogTransform,"*%d ",out); |
333 | 0 | } |
334 | |
|
335 | 0 | logtrace(LogTransform,"*\n"); |
336 | 0 | } |
337 | 0 | } |
338 | | |
339 | | |
340 | | void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, |
341 | | int bit_depth) |
342 | 0 | { |
343 | 0 | int16_t g[4][4]; |
344 | |
|
345 | 0 | int postShift = 20-bit_depth; |
346 | 0 | int rndV = 1<<(7-1); |
347 | 0 | int rndH = 1<<(postShift-1); |
348 | | |
349 | | |
350 | | // --- V --- |
351 | |
|
352 | 0 | for (int c=0;c<4;c++) { |
353 | | /* |
354 | | logtrace(LogTransform,"DST-V: "); |
355 | | for (int r=0;r<4;r++) { |
356 | | logtrace(LogTransform,"%d ",coeffs[c+r*4]); |
357 | | } |
358 | | logtrace(LogTransform,"* -> "); |
359 | | */ |
360 | |
|
361 | 0 | for (int i=0;i<4;i++) { |
362 | 0 | int sum=0; |
363 | |
|
364 | 0 | for (int j=0;j<4;j++) { |
365 | 0 | sum += mat_8_357[j][i] * coeffs[c+j*4]; |
366 | 0 | } |
367 | |
|
368 | 0 | g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); |
369 | 0 | } |
370 | | |
371 | | /* |
372 | | for (int y=0;y<4;y++) { |
373 | | logtrace(LogTransform,"*%d ",g[y][c]); |
374 | | } |
375 | | logtrace(LogTransform,"*\n"); |
376 | | */ |
377 | 0 | } |
378 | | |
379 | | |
380 | | // --- H --- |
381 | |
|
382 | 0 | for (int y=0;y<4;y++) { |
383 | | |
384 | | /* |
385 | | logtrace(LogTransform,"DST-H: "); |
386 | | for (int c=0;c<4;c++) { |
387 | | logtrace(LogTransform,"%d ",g[y][c]); |
388 | | } |
389 | | logtrace(LogTransform,"* -> "); |
390 | | */ |
391 | |
|
392 | 0 | for (int i=0;i<4;i++) { |
393 | 0 | int sum=0; |
394 | |
|
395 | 0 | for (int j=0;j<4;j++) { |
396 | 0 | sum += mat_8_357[j][i] * g[y][j]; |
397 | 0 | } |
398 | |
|
399 | 0 | int out = Clip3(-32768,32767, (sum+rndH)>>postShift); |
400 | |
|
401 | 0 | dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); |
402 | |
|
403 | 0 | logtrace(LogTransform,"*%d ",out); |
404 | 0 | } |
405 | |
|
406 | 0 | logtrace(LogTransform,"*\n"); |
407 | 0 | } |
408 | 0 | } |
409 | | |
410 | | |
411 | | void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
412 | 0 | { |
413 | 0 | int16_t g[4*4]; |
414 | |
|
415 | 0 | int BD = 8; |
416 | 0 | int shift1 = Log2(4) + BD -9; |
417 | 0 | int shift2 = Log2(4) + 6; |
418 | |
|
419 | 0 | int rnd1 = 1<<(shift1-1); |
420 | 0 | int rnd2 = 1<<(shift2-1); |
421 | | |
422 | | |
423 | | // --- V --- |
424 | |
|
425 | 0 | for (int c=0;c<4;c++) { |
426 | | |
427 | | /* |
428 | | logtrace(LogTransform,"DST-V: "); |
429 | | for (int r=0;r<4;r++) { |
430 | | logtrace(LogTransform,"%d ",coeffs[c+r*4]); |
431 | | } |
432 | | logtrace(LogTransform,"* -> "); |
433 | | */ |
434 | |
|
435 | 0 | for (int i=0;i<4;i++) { |
436 | 0 | int sum=0; |
437 | |
|
438 | 0 | for (int j=0;j<4;j++) { |
439 | 0 | sum += mat_8_357[i][j] * input[c+j*stride]; |
440 | 0 | } |
441 | |
|
442 | 0 | g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1); |
443 | 0 | } |
444 | 0 | } |
445 | | |
446 | | |
447 | | // --- H --- |
448 | |
|
449 | 0 | for (int y=0;y<4;y++) { |
450 | 0 | for (int i=0;i<4;i++) { |
451 | 0 | int sum=0; |
452 | |
|
453 | 0 | for (int j=0;j<4;j++) { |
454 | 0 | sum += mat_8_357[i][j] * g[y*4+j]; |
455 | 0 | } |
456 | | |
457 | | // TODO: do we need clipping ? |
458 | 0 | int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift); |
459 | |
|
460 | 0 | coeffs[y*4+i] = out; |
461 | |
|
462 | 0 | logtrace(LogTransform,"*%d ",out); |
463 | 0 | } |
464 | |
|
465 | 0 | logtrace(LogTransform,"*\n"); |
466 | 0 | } |
467 | 0 | } |
468 | | |
469 | | |
470 | | void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) |
471 | 0 | { |
472 | 0 | int16_t g[4][4]; |
473 | |
|
474 | 0 | int rndV = 1<<(7-1); |
475 | 0 | int rndH = 1<<(bdShift-1); |
476 | |
|
477 | 0 | int CoeffMax = (1<<max_coeff_bits)-1; |
478 | 0 | int CoeffMin = -(1<<max_coeff_bits); |
479 | | |
480 | | |
481 | | // --- V --- |
482 | |
|
483 | 0 | for (int c=0;c<4;c++) { |
484 | 0 | for (int i=0;i<4;i++) { |
485 | 0 | int sum=0; |
486 | |
|
487 | 0 | for (int j=0;j<4;j++) { |
488 | 0 | sum += mat_8_357[j][i] * coeffs[c+j*4]; |
489 | 0 | } |
490 | |
|
491 | 0 | g[i][c] = Clip3(CoeffMin,CoeffMax, (sum+rndV)>>7); |
492 | 0 | } |
493 | 0 | } |
494 | | |
495 | | |
496 | | // --- H --- |
497 | |
|
498 | 0 | for (int y=0;y<4;y++) { |
499 | 0 | for (int i=0;i<4;i++) { |
500 | 0 | int sum=0; |
501 | |
|
502 | 0 | for (int j=0;j<4;j++) { |
503 | 0 | sum += mat_8_357[j][i] * g[y][j]; |
504 | 0 | } |
505 | |
|
506 | 0 | dst[y*4+i] = (sum + rndH)>>bdShift; |
507 | 0 | } |
508 | 0 | } |
509 | 0 | } |
510 | | |
511 | | |
512 | | |
// 32x32 DCT basis matrix. Row k holds the k-th basis vector; the smaller
// transforms in this file use every (32/nT)-th row and the first nT columns.
// Read-only lookup table, hence const.
static const int8_t mat_dct[32][32] = {
  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
  { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
  { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90},
  { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90},
  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
  { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
  { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87},
  { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
  { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
  { 80, 9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70, 9, 80},
  { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78},
  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
  { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
  { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70},
  { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
  { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61},
  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
  { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54},
  { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
  { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
  { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43},
  { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38},
  { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
  { 31,-78, 90,-61, 4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
  { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25},
  { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22},
  { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
  { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
  { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9},
  { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
};
547 | | |
548 | | |
549 | | |
550 | | |
551 | | template <class pixel_t> |
552 | | void transform_idct_add(pixel_t *dst, ptrdiff_t stride, |
553 | | int nT, const int16_t *coeffs, int bit_depth) |
554 | 0 | { |
555 | | /* |
556 | | The effective shift is |
557 | | 7 bits right for bit-depth 8, |
558 | | 6 bits right for bit-depth 9, |
559 | | 5 bits right for bit-depth 10. |
560 | | |
561 | | Computation is independent of the block size. |
562 | | Each multiplication with the table includes a left shift of 6 bits. |
563 | | Hence, we have 2* 6 bits = 12 bits left shift. |
564 | | V-pass has fixed 7 bit right shift. |
565 | | H-pass has 20-BitDepth bit right shift; |
566 | | |
567 | | Effective shift 's' means: residual value 1 gives DC-coeff (1<<s). |
568 | | */ |
569 | | |
570 | |
|
571 | 0 | int postShift = 20-bit_depth; |
572 | 0 | int rnd1 = 1<<(7-1); |
573 | 0 | int rnd2 = 1<<(postShift-1); |
574 | 0 | int fact = (1<<(5-Log2(nT))); |
575 | |
|
576 | 0 | int16_t g[32*32]; // actually, only [nT*nT] used |
577 | | |
578 | | // TODO: valgrind reports that dst[] contains uninitialized data. |
579 | | // Probably from intra-prediction. |
580 | | |
581 | | /* |
582 | | for (int i=0;i<nT*nT;i++) { |
583 | | printf("%d\n",coeffs[i]); |
584 | | } |
585 | | |
586 | | for (int y=0;y<nT;y++) { |
587 | | for (int i=0;i<nT;i++) { |
588 | | printf("%d ",dst[y*stride+i]); |
589 | | } |
590 | | } |
591 | | printf("\n"); |
592 | | */ |
593 | | |
594 | | /* |
595 | | printf("--- input\n"); |
596 | | for (int r=0;r<nT;r++, printf("\n")) |
597 | | for (int c=0;c<nT;c++) { |
598 | | printf("%3d ",coeffs[c+r*nT]); |
599 | | } |
600 | | */ |
601 | |
|
602 | 0 | for (int c=0;c<nT;c++) { |
603 | | |
604 | | /* |
605 | | logtrace(LogTransform,"DCT-V: "); |
606 | | for (int i=0;i<nT;i++) { |
607 | | logtrace(LogTransform,"*%d ",coeffs[c+i*nT]); |
608 | | } |
609 | | logtrace(LogTransform,"* -> "); |
610 | | */ |
611 | | |
612 | | |
613 | | // find last non-zero coefficient to reduce computations carried out in DCT |
614 | |
|
615 | 0 | int lastCol = nT-1; |
616 | 0 | for (;lastCol>=0;lastCol--) { |
617 | 0 | if (coeffs[c+lastCol*nT]) { break; } |
618 | 0 | } |
619 | |
|
620 | 0 | for (int i=0;i<nT;i++) { |
621 | 0 | int sum=0; |
622 | | |
623 | | /* |
624 | | printf("input: "); |
625 | | for (int j=0;j<nT;j++) { |
626 | | printf("%3d ",coeffs[c+j*nT]); |
627 | | } |
628 | | printf("\n"); |
629 | | |
630 | | printf("mat: "); |
631 | | for (int j=0;j<nT;j++) { |
632 | | printf("%3d ",mat_dct[fact*j][i]); |
633 | | } |
634 | | printf("\n"); |
635 | | */ |
636 | |
|
637 | 0 | for (int j=0;j<=lastCol /*nT*/;j++) { |
638 | 0 | sum += mat_dct[fact*j][i] * coeffs[c+j*nT]; |
639 | 0 | } |
640 | |
|
641 | 0 | g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7); |
642 | |
|
643 | 0 | logtrace(LogTransform,"*%d ",g[c+i*nT]); |
644 | 0 | } |
645 | 0 | logtrace(LogTransform,"*\n"); |
646 | 0 | } |
647 | | |
648 | | /* |
649 | | printf("--- temp\n"); |
650 | | for (int r=0;r<nT;r++, printf("\n")) |
651 | | for (int c=0;c<nT;c++) { |
652 | | printf("%3d ",g[c+r*nT]); |
653 | | } |
654 | | */ |
655 | |
|
656 | 0 | for (int y=0;y<nT;y++) { |
657 | | /* |
658 | | logtrace(LogTransform,"DCT-H: "); |
659 | | for (int i=0;i<nT;i++) { |
660 | | logtrace(LogTransform,"*%d ",g[i+y*nT]); |
661 | | } |
662 | | logtrace(LogTransform,"* -> "); |
663 | | */ |
664 | | |
665 | | |
666 | | // find last non-zero coefficient to reduce computations carried out in DCT |
667 | |
|
668 | 0 | int lastCol = nT-1; |
669 | 0 | for (;lastCol>=0;lastCol--) { |
670 | 0 | if (g[y*nT+lastCol]) { break; } |
671 | 0 | } |
672 | | |
673 | |
|
674 | 0 | for (int i=0;i<nT;i++) { |
675 | 0 | int sum=0; |
676 | |
|
677 | 0 | for (int j=0;j<=lastCol /*nT*/;j++) { |
678 | 0 | sum += mat_dct[fact*j][i] * g[y*nT+j]; |
679 | 0 | } |
680 | | |
681 | | //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift); |
682 | 0 | int out = (sum+rnd2)>>postShift; |
683 | | |
684 | | //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i); |
685 | | //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i])); |
686 | 0 | dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); |
687 | |
|
688 | 0 | logtrace(LogTransform,"*%d ",out); |
689 | 0 | } |
690 | 0 | logtrace(LogTransform,"*\n"); |
691 | 0 | } |
692 | 0 | } Unexecuted instantiation: void transform_idct_add<unsigned char>(unsigned char*, long, int, short const*, int) Unexecuted instantiation: void transform_idct_add<unsigned short>(unsigned short*, long, int, short const*, int) |
693 | | |
694 | | |
695 | | |
696 | | void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits) |
697 | 0 | { |
698 | | /* |
699 | | The effective shift is |
700 | | 7 bits right for bit-depth 8, |
701 | | 6 bits right for bit-depth 9, |
702 | | 5 bits right for bit-depth 10. |
703 | | |
704 | | One transformation with raw transform filter values increases range be 2048 (=32*64). |
705 | | This equals 11 bits. |
706 | | |
707 | | Computation is independent of the block size. |
708 | | Each multiplication with the table includes a left shift of 6 bits. |
709 | | Hence, we have 2* 6 bits = 12 bits left shift. |
710 | | V-pass has fixed 7 bit right shift. |
711 | | H-pass has 20-BitDepth bit right shift; |
712 | | |
713 | | Effective shift 's' means: residual value 1 gives DC-coeff (1<<s). |
714 | | */ |
715 | | |
716 | |
|
717 | 0 | int rnd1 = 1<<(7-1); |
718 | 0 | int fact = (1<<(5-Log2(nT))); |
719 | | |
720 | | //int bdShift = 20-bit_depth; |
721 | 0 | int rnd2 = 1<<(bdShift-1); |
722 | |
|
723 | 0 | int16_t g[32*32]; // actually, only [nT*nT] used |
724 | |
|
725 | 0 | int CoeffMax = (1<<max_coeff_bits)-1; |
726 | 0 | int CoeffMin = -(1<<max_coeff_bits); |
727 | | |
728 | | // TODO: valgrind reports that dst[] contains uninitialized data. |
729 | | // Probably from intra-prediction. |
730 | | |
731 | | /* |
732 | | for (int i=0;i<nT*nT;i++) { |
733 | | printf("%d\n",coeffs[i]); |
734 | | } |
735 | | |
736 | | for (int y=0;y<nT;y++) { |
737 | | for (int i=0;i<nT;i++) { |
738 | | printf("%d ",dst[y*stride+i]); |
739 | | } |
740 | | } |
741 | | printf("\n"); |
742 | | */ |
743 | | |
744 | | /* |
745 | | printf("--- input\n"); |
746 | | for (int r=0;r<nT;r++, printf("\n")) |
747 | | for (int c=0;c<nT;c++) { |
748 | | printf("%3d ",coeffs[c+r*nT]); |
749 | | } |
750 | | */ |
751 | |
|
752 | 0 | for (int c=0;c<nT;c++) { |
753 | | |
754 | | /* |
755 | | logtrace(LogTransform,"DCT-V: "); |
756 | | for (int i=0;i<nT;i++) { |
757 | | logtrace(LogTransform,"*%d ",coeffs[c+i*nT]); |
758 | | } |
759 | | logtrace(LogTransform,"* -> "); |
760 | | */ |
761 | | |
762 | | |
763 | | // find last non-zero coefficient to reduce computations carried out in DCT |
764 | |
|
765 | 0 | int lastCol = nT-1; |
766 | 0 | for (;lastCol>=0;lastCol--) { |
767 | 0 | if (coeffs[c+lastCol*nT]) { break; } |
768 | 0 | } |
769 | |
|
770 | 0 | for (int i=0;i<nT;i++) { |
771 | 0 | int sum=0; |
772 | | |
773 | | /* |
774 | | printf("input: "); |
775 | | for (int j=0;j<nT;j++) { |
776 | | printf("%3d ",coeffs[c+j*nT]); |
777 | | } |
778 | | printf("\n"); |
779 | | |
780 | | printf("mat: "); |
781 | | for (int j=0;j<nT;j++) { |
782 | | printf("%3d ",mat_dct[fact*j][i]); |
783 | | } |
784 | | printf("\n"); |
785 | | */ |
786 | |
|
787 | 0 | for (int j=0;j<=lastCol /*nT*/;j++) { |
788 | 0 | sum += mat_dct[fact*j][i] * coeffs[c+j*nT]; |
789 | 0 | } |
790 | |
|
791 | 0 | g[c+i*nT] = Clip3(CoeffMin,CoeffMax, (sum+rnd1)>>7); |
792 | |
|
793 | 0 | logtrace(LogTransform,"*%d ",g[c+i*nT]); |
794 | 0 | } |
795 | 0 | logtrace(LogTransform,"*\n"); |
796 | 0 | } |
797 | | |
798 | | /* |
799 | | printf("--- temp\n"); |
800 | | for (int r=0;r<nT;r++, printf("\n")) |
801 | | for (int c=0;c<nT;c++) { |
802 | | printf("%3d ",g[c+r*nT]); |
803 | | } |
804 | | */ |
805 | |
|
806 | 0 | for (int y=0;y<nT;y++) { |
807 | | /* |
808 | | logtrace(LogTransform,"DCT-H: "); |
809 | | for (int i=0;i<nT;i++) { |
810 | | logtrace(LogTransform,"*%d ",g[i+y*nT]); |
811 | | } |
812 | | logtrace(LogTransform,"* -> "); |
813 | | */ |
814 | | |
815 | | |
816 | | // find last non-zero coefficient to reduce computations carried out in DCT |
817 | |
|
818 | 0 | int lastCol = nT-1; |
819 | 0 | for (;lastCol>=0;lastCol--) { |
820 | 0 | if (g[y*nT+lastCol]) { break; } |
821 | 0 | } |
822 | | |
823 | |
|
824 | 0 | for (int i=0;i<nT;i++) { |
825 | 0 | int sum=0; |
826 | |
|
827 | 0 | for (int j=0;j<=lastCol /*nT*/;j++) { |
828 | 0 | sum += mat_dct[fact*j][i] * g[y*nT+j]; |
829 | 0 | } |
830 | |
|
831 | 0 | dst[y*nT+i] = (sum + rnd2)>>bdShift; |
832 | |
|
833 | 0 | logtrace(LogTransform,"*%d ",sum); |
834 | 0 | } |
835 | 0 | logtrace(LogTransform,"*\n"); |
836 | 0 | } |
837 | 0 | } |
838 | | |
839 | | |
840 | | void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) |
841 | 0 | { |
842 | 0 | transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits); |
843 | 0 | } |
844 | | |
845 | | void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) |
846 | 0 | { |
847 | 0 | transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits); |
848 | 0 | } |
849 | | |
850 | | void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, |
851 | | int bdShift, int max_coeff_bits) |
852 | 0 | { |
853 | 0 | transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits); |
854 | 0 | } |
855 | | |
856 | | void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, |
857 | | int bdShift, int max_coeff_bits) |
858 | 0 | { |
859 | 0 | transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits); |
860 | 0 | } |
861 | | |
862 | | |
863 | | |
864 | | |
865 | | void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
866 | 0 | { |
867 | 0 | transform_idct_add<uint8_t>(dst,stride, 4, coeffs, 8); |
868 | 0 | } |
869 | | |
870 | | void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
871 | 0 | { |
872 | 0 | transform_idct_add<uint8_t>(dst,stride, 8, coeffs, 8); |
873 | 0 | } |
874 | | |
875 | | void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
876 | 0 | { |
877 | 0 | transform_idct_add<uint8_t>(dst,stride, 16, coeffs, 8); |
878 | 0 | } |
879 | | |
880 | | void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
881 | 0 | { |
882 | 0 | transform_idct_add<uint8_t>(dst,stride, 32, coeffs, 8); |
883 | 0 | } |
884 | | |
885 | | |
886 | | void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) |
887 | 0 | { |
888 | 0 | transform_idct_add<uint16_t>(dst,stride, 4, coeffs, bit_depth); |
889 | 0 | } |
890 | | |
891 | | void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) |
892 | 0 | { |
893 | 0 | transform_idct_add<uint16_t>(dst,stride, 8, coeffs, bit_depth); |
894 | 0 | } |
895 | | |
896 | | void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) |
897 | 0 | { |
898 | 0 | transform_idct_add<uint16_t>(dst,stride, 16, coeffs, bit_depth); |
899 | 0 | } |
900 | | |
901 | | void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) |
902 | 0 | { |
903 | 0 | transform_idct_add<uint16_t>(dst,stride, 32, coeffs, bit_depth); |
904 | 0 | } |
905 | | |
906 | | |
907 | | static void transform_fdct_8(int16_t* coeffs, int nT, |
908 | | const int16_t *input, ptrdiff_t stride) |
909 | 0 | { |
910 | | /* |
911 | | Each sum over a basis vector sums nT elements, which is compensated by |
912 | | shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT. |
913 | | Do this in each of the H/V passes. |
914 | | |
915 | | Each multiplication with the table includes a left shift of 6 bits. |
916 | | Hence, we have in total 2* 6 bits = 12 bits left shift because of the |
917 | | multiplications. |
918 | | |
919 | | We carry out shifts after each pass: |
920 | | First (V) pass has BitDepth-9 bit right shift, |
921 | | Second (H) pass has fixed 6 bit right shift. |
922 | | |
923 | | For bit-depth 8, the total shift is 7 bits left. |
924 | | For bit-depth 9, the total shift is 6 bits left. |
925 | | For bit-depth 10, the total shift is 5 bits left. |
926 | | |
927 | | I.e.: a constant residual value 1 gives DC-coeff (1<<s). |
928 | | |
929 | | For 8-bit images in a 32x32 block, the input are 8 bits + 1 sign bit. |
930 | | After the first pass, we need 9+5+6=20 bits for the intermediate sum |
931 | | (9 bit input, 5 bit because we sum 2^5 elements, 6 bit because of multiplication with 64). |
932 | | The first pass shift is Log2(32) - 1 -> 4 bits and we are down to 16 bits again. |
933 | | After the second pass, we need 16+5+6=27 bits for the intermediate sum |
934 | | (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication). |
935 | | The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits. |
936 | | |
937 | | For larger input bit-depths, the intermediate result after the first pass |
938 | | will be wider accordingly, but the widths after the shifts are the same. |
939 | | */ |
940 | |
|
941 | 0 | int BitDepth = 8; |
942 | | |
943 | | // / compensate everything | / effective word length | |
944 | 0 | int shift1 = Log2(nT) + 6 + BitDepth - 15; |
945 | 0 | int shift2 = Log2(nT) + 6; |
946 | |
|
947 | 0 | int rnd1 = 1<<(shift1-1); |
948 | 0 | int rnd2 = 1<<(shift2-1); |
949 | 0 | int fact = (1<<(5-Log2(nT))); |
950 | |
|
951 | 0 | int16_t g[32*32]; // actually, only [nT*nT] used |
952 | |
|
953 | 0 | for (int c=0;c<nT;c++) { |
954 | |
|
955 | 0 | for (int i=0;i<nT;i++) { |
956 | 0 | int sum=0; |
957 | |
|
958 | 0 | for (int j=0;j<nT;j++) { |
959 | 0 | sum += mat_dct[fact*i][j] * input[c+j*stride]; |
960 | 0 | } |
961 | |
|
962 | 0 | g[c+i*nT] = (sum+rnd1)>>shift1; // clipping to -32768;32767 unnecessary |
963 | 0 | } |
964 | 0 | } |
965 | | |
966 | |
|
967 | 0 | for (int y=0;y<nT;y++) { |
968 | 0 | for (int i=0;i<nT;i++) { |
969 | 0 | int sum=0; |
970 | |
|
971 | 0 | for (int j=0;j<nT;j++) { |
972 | 0 | sum += mat_dct[fact*i][j] * g[y*nT+j]; |
973 | 0 | } |
974 | | |
975 | | // no clipping to -32768;32767 required |
976 | 0 | int out = (sum+rnd2)>>shift2; |
977 | |
|
978 | 0 | coeffs[y*nT+i] = out; |
979 | 0 | } |
980 | 0 | } |
981 | 0 | } |
982 | | |
983 | | |
984 | | void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
985 | 0 | { |
986 | 0 | transform_fdct_8(coeffs, 4, input,stride); |
987 | 0 | } |
988 | | |
989 | | void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
990 | 0 | { |
991 | 0 | transform_fdct_8(coeffs, 8, input,stride); |
992 | 0 | } |
993 | | |
994 | | void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
995 | 0 | { |
996 | 0 | transform_fdct_8(coeffs, 16, input,stride); |
997 | 0 | } |
998 | | |
999 | | void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
1000 | 0 | { |
1001 | 0 | transform_fdct_8(coeffs, 32, input,stride); |
1002 | 0 | } |
1003 | | |
1004 | | |
1005 | | |
1006 | | |
// Generic n x n Hadamard transform built from add/subtract butterflies.
//
//   coeffs - output, written as a dense n x n array (row length n)
//   n      - block size; the scratch buffers below hold at most 32 x 32 values
//   input  - source block, rows are 'stride' elements apart
//
// The rows of 'input' are transformed first into a dense intermediate block,
// then the columns of that block are transformed into 'coeffs'. All
// intermediate values are stored as int16_t.
void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride)
{
  int16_t rowPass[32*32];

  // two scratch lines that are used alternately by the butterfly stages
  int16_t lineA[32], lineB[32];
  int16_t *cur  = lineA;
  int16_t *prev = lineB;

  const int half = (n>>1);

  // --- row transforms ---

  for (int y=0; y<n; y++) {
    const int rowOffset = y*stride;

    // first stage: combine the left and right half of the row
    for (int i=0; i<half; i++) {
      const int16_t left  = input[i+rowOffset];
      const int16_t right = input[i+half+rowOffset];

      cur[     i] = left + right;
      cur[half+i] = left - right;
    }

    // middle stages: the butterfly distance halves each time, down to 2
    for (int span=(n>>2), group=half; span>=2; span>>=1, group>>=1) {
      int16_t *swapTmp = cur;
      cur  = prev;
      prev = swapTmp;

      for (int k=0; k<n; k+=group) {
        for (int i=0; i<span; i++) {
          const int16_t p = prev[k+i];
          const int16_t q = prev[k+i+span];

          cur[k+i     ] = p + q;
          cur[k+i+span] = p - q;
        }
      }
    }

    // last stage: neighbouring pairs, stored into the intermediate block
    int16_t *outRow = rowPass + n*y;
    for (int k=0; k<n; k+=2) {
      outRow[k  ] = cur[k] + cur[k+1];
      outRow[k+1] = cur[k] - cur[k+1];
    }
  }

  // --- column transforms ---

  for (int x=0; x<n; x++) {
    // first stage: combine the upper and lower half of the column
    for (int i=0; i<half; i++) {
      const int16_t upper = rowPass[i*n+x];
      const int16_t lower = rowPass[(i+half)*n+x];

      cur[     i] = upper + lower;
      cur[half+i] = upper - lower;
    }

    // middle stages, same scheme as in the row pass
    for (int span=(n>>2), group=half; span>=2; span>>=1, group>>=1) {
      int16_t *swapTmp = cur;
      cur  = prev;
      prev = swapTmp;

      for (int k=0; k<n; k+=group) {
        for (int i=0; i<span; i++) {
          const int16_t p = prev[k+i];
          const int16_t q = prev[k+i+span];

          cur[k+i     ] = p + q;
          cur[k+i+span] = p - q;
        }
      }
    }

    // last stage: neighbouring pairs, written down the output column
    for (int k=0; k<n; k+=2) {
      coeffs[x+(k  )*n] = cur[k] + cur[k+1];
      coeffs[x+(k+1)*n] = cur[k] - cur[k+1];
    }
  }
}
1082 | | |
1083 | | |
// 4x4 Hadamard transform, fully unrolled.
//
//   coeffs - output, written as a dense 4x4 array (row length 4)
//   input  - source block, rows are 'stride' elements apart
//
// The four rows are transformed first into a dense intermediate block, then
// the four columns of that block are transformed into 'coeffs'. Each 1-D
// transform consists of two add/subtract butterfly stages whose results are
// held in int16_t.
void hadamard_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
{
  int16_t rowPass[4*4];

  // --- row transforms ---

  for (int y=0; y<4; y++) {
    const int rowOffset = y*stride;
    const int16_t *src = input + rowOffset;
    int16_t *out = rowPass + 4*y;

    // stage 1: elements two positions apart
    const int16_t sumEven  = src[0] + src[2];
    const int16_t sumOdd   = src[1] + src[3];
    const int16_t diffEven = src[0] - src[2];
    const int16_t diffOdd  = src[1] - src[3];

    // stage 2: neighbouring results
    out[0] = sumEven  + sumOdd;
    out[1] = sumEven  - sumOdd;
    out[2] = diffEven + diffOdd;
    out[3] = diffEven - diffOdd;
  }

  // --- column transforms ---

  for (int x=0; x<4; x++) {
    const int16_t *src = rowPass + x;
    int16_t *out = coeffs + x;

    // stage 1: elements two rows apart
    const int16_t sumEven  = src[0*4] + src[2*4];
    const int16_t sumOdd   = src[1*4] + src[3*4];
    const int16_t diffEven = src[0*4] - src[2*4];
    const int16_t diffOdd  = src[1*4] - src[3*4];

    // stage 2: neighbouring results
    out[0*4] = sumEven  + sumOdd;
    out[1*4] = sumEven  - sumOdd;
    out[2*4] = diffEven + diffOdd;
    out[3*4] = diffEven - diffOdd;
  }
}
1124 | | |
1125 | | |
// 8x8 Hadamard transform.
//
//   coeffs - output, written as a dense 8x8 array (row length 8)
//   input  - source block, rows are 'stride' elements apart
//
// The eight rows are transformed first into a dense intermediate block, then
// the eight columns of that block are transformed into 'coeffs'. Each 1-D
// transform consists of three add/subtract butterfly stages with distances
// 4, 2 and 1; all intermediate values are stored as int16_t.
void hadamard_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
{
  int16_t rowPass[8*8];
  int16_t stage1[8], stage2[8];

  // --- row transforms ---

  for (int y=0; y<8; y++) {
    const int rowOffset = y*stride;
    const int16_t *src = input + rowOffset;
    int16_t *out = rowPass + 8*y;

    // distance 4
    for (int i=0; i<4; i++) {
      stage1[i  ] = src[i] + src[i+4];
      stage1[i+4] = src[i] - src[i+4];
    }

    // distance 2, separately within each group of four
    for (int k=0; k<8; k+=4) {
      for (int i=0; i<2; i++) {
        stage2[k+i  ] = stage1[k+i] + stage1[k+i+2];
        stage2[k+i+2] = stage1[k+i] - stage1[k+i+2];
      }
    }

    // distance 1, neighbouring pairs
    for (int k=0; k<8; k+=2) {
      out[k  ] = stage2[k] + stage2[k+1];
      out[k+1] = stage2[k] - stage2[k+1];
    }
  }

  // --- column transforms ---

  for (int x=0; x<8; x++) {
    const int16_t *src = rowPass + x;
    int16_t *out = coeffs + x;

    // distance 4
    for (int i=0; i<4; i++) {
      stage1[i  ] = src[i*8] + src[(i+4)*8];
      stage1[i+4] = src[i*8] - src[(i+4)*8];
    }

    // distance 2, separately within each group of four
    for (int k=0; k<8; k+=4) {
      for (int i=0; i<2; i++) {
        stage2[k+i  ] = stage1[k+i] + stage1[k+i+2];
        stage2[k+i+2] = stage1[k+i] - stage1[k+i+2];
      }
    }

    // distance 1, neighbouring pairs
    for (int k=0; k<8; k+=2) {
      out[(k  )*8] = stage2[k] + stage2[k+1];
      out[(k+1)*8] = stage2[k] - stage2[k+1];
    }
  }
}
1200 | | |
1201 | | |
1202 | | void hadamard_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
1203 | 0 | { |
1204 | 0 | hadamard_transform_8(coeffs,16, input,stride); |
1205 | 0 | } |
1206 | | |
1207 | | void hadamard_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) |
1208 | 0 | { |
1209 | 0 | hadamard_transform_8(coeffs,32, input,stride); |
1210 | 0 | } |